From b92a42a494b254635a1479a5f9acb0936eeaf835 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Wed, 22 Jun 2022 19:41:38 +0200
Subject: [PATCH 001/196] Clean historuy

---
 docs/source/en/_toctree.yml                   |    2 +
 docs/source/en/index.mdx                      |    1 +
 docs/source/en/model_doc/jukebox.mdx          |   92 +
 docs/source/en/serialization.mdx              |    1 +
 src/transformers/__init__.py                  |   18 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    3 +
 src/transformers/models/auto/modeling_auto.py |    1 +
 .../models/auto/tokenization_auto.py          |    1 +
 src/transformers/models/jukebox/__init__.py   |   62 +
 .../models/jukebox/configuration_jukebox.py   |  475 +++
 .../models/jukebox/convert_jukebox.py         |  137 +
 ...kebox_original_tf_checkpoint_to_pytorch.py |   75 +
 .../models/jukebox/modeling_jukebox.py        | 3374 +++++++++++++++++
 .../models/jukebox/tokenization_jukebox.py    |  346 ++
 .../jukebox/tokenization_jukebox_fast.py      |  147 +
 src/transformers/utils/dummy_pt_objects.py    |   21 +
 .../utils/dummy_sentencepiece_objects.py      |    7 +
 tests/models/jukebox/__init__.py              |    0
 tests/models/jukebox/test_modeling_jukebox.py | 1346 +++++++
 .../jukebox/test_tokenization_jukebox.py      |  211 ++
 21 files changed, 6321 insertions(+)
 create mode 100644 docs/source/en/model_doc/jukebox.mdx
 create mode 100644 src/transformers/models/jukebox/__init__.py
 create mode 100644 src/transformers/models/jukebox/configuration_jukebox.py
 create mode 100644 src/transformers/models/jukebox/convert_jukebox.py
 create mode 100644 src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
 create mode 100755 src/transformers/models/jukebox/modeling_jukebox.py
 create mode 100644 src/transformers/models/jukebox/tokenization_jukebox.py
 create mode 100644 src/transformers/models/jukebox/tokenization_jukebox_fast.py
 create mode 100644 tests/models/jukebox/__init__.py
 create mode 100644 tests/models/jukebox/test_modeling_jukebox.py
 create mode 100644 tests/models/jukebox/test_tokenization_jukebox.py

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index aaa5f4a37480f..dcd475eb7fbc0 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -252,6 +252,8 @@
       title: I-BERT
     - local: model_doc/imagegpt
       title: ImageGPT
+    - local: model_doc/jukebox
+      title: Jukebox
     - local: model_doc/layoutlm
       title: LayoutLM
     - local: model_doc/layoutlmv2
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 1cfd105c263ee..cd5ffd9f52f5f 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -230,6 +230,7 @@ Flax), PyTorch, and/or TensorFlow.
 |           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          ImageGPT           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           Jukebox           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 |         LayoutLMv2          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 |         LayoutLMv3          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
new file mode 100644
index 0000000000000..8ae0d831810bf
--- /dev/null
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -0,0 +1,92 @@
+---
+language: 
+  - "List of ISO 639-1 code for your language"
+  - en
+  - lang2
+thumbnail: "https://cdn.openai.com/research-covers/jukebox/2x-no-mark.jpg"
+tags:
+- MusicGeneration
+- transformers
+---
+
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Jukebox
+
+## Overview
+
+The Jukebox model was proposed in [Jukebox: A generative model for music](https://arxiv.org/pdf/2005.00341.pdf)
+by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford,
+Ilya Sutskever.
+
+This model proposes a generative music model which can be produce minute long samples which can bne conditionned on
+artist, genre and lyrics.
+
+The abstract from the paper is the following:
+
+We introduce Jukebox, a model that generates
+music with singing in the raw audio domain. We
+tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes,
+and modeling those using autoregressive Transformers. We show that the combined model at
+scale can generate high-fidelity and diverse songs
+with coherence up to multiple minutes. We can
+condition on artist and genre to steer the musical
+and vocal style, and on unaligned lyrics to make
+the singing more controllable. We are releasing
+thousands of non cherry-picked samples, along
+with model weights and code.
+
+Tips:
+
+This model is very slow for now, and takes 18h to generate a minute long audio. 
+
+This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/openai/jukebox).
+
+## JukeboxConfig
+
+[[autodoc]] JukeboxConfig
+
+## JukeboxTokenizer
+
+[[autodoc]] JukeboxTokenizer - save_vocabulary
+
+## JukeboxTokenizerFast
+
+[[autodoc]] JukeboxTokenizerFast
+
+## Jukebox specific outputs
+
+[[autodoc]] models.jukebox.modeling_jukebox.JukeboxDoubleHeadsModelOutput
+
+[[autodoc]] models.jukebox.modeling_tf_jukebox.TFJukeboxDoubleHeadsModelOutput
+
+## JukeboxModel
+
+[[autodoc]] JukeboxModel - forward - parallelize - deparallelize
+
+## JukeboxLMHeadModel
+
+[[autodoc]] JukeboxLMHeadModel - forward - parallelize - deparallelize
+
+## JukeboxDoubleHeadsModel
+
+[[autodoc]] JukeboxDoubleHeadsModel - forward
+
+## JukeboxForSequenceClassification
+
+[[autodoc]] JukeboxForSequenceClassification - forward
+
+## JukeboxForTokenClassification
+
+[[autodoc]] JukeboxForTokenClassification - forward
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index c10428f6199ad..cc83b9a2502e5 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -67,6 +67,7 @@ Ready-made configurations include the following architectures:
 - GPT Neo
 - GPT-J
 - I-BERT
+- Jukebox
 - LayoutLM
 - LongT5
 - M2M100
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index a68420b127ade..cda1befad22fa 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -220,6 +220,7 @@
     "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
     "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
     "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"],
+    "models.jukebox": ["JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "JukeboxConfig", "JukeboxTokenizer"],
     "models.layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMTokenizer"],
     "models.layoutlmv2": [
         "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -722,6 +723,7 @@
     _import_structure["modeling_utils"] = ["PreTrainedModel"]
 
     # PyTorch models structure
+
     _import_structure["models.albert"].extend(
         [
             "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1218,6 +1220,14 @@
             "load_tf_weights_in_imagegpt",
         ]
     )
+    _import_structure["models.jukebox"].extend(
+        [
+            "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "JukeboxModel",
+            "JukeboxPreTrainedModel",
+            "load_tf_weights_in_jukebox",
+        ]
+    )
     _import_structure["models.layoutlm"].extend(
         [
             "LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2846,6 +2856,7 @@
     from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
     from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
     from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig
+    from .models.jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig, JukeboxTokenizer
     from .models.layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMTokenizer
     from .models.layoutlmv2 import (
         LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -3064,6 +3075,7 @@
         from .models.cpm import CpmTokenizer
         from .models.deberta_v2 import DebertaV2Tokenizer
         from .models.fnet import FNetTokenizer
+        from .models.jukebox import JukeboxTokenizer
         from .models.layoutxlm import LayoutXLMTokenizer
         from .models.m2m_100 import M2M100Tokenizer
         from .models.marian import MarianTokenizer
@@ -3694,6 +3706,12 @@
             ImageGPTPreTrainedModel,
             load_tf_weights_in_imagegpt,
         )
+        from .models.jukebox import (
+            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            JukeboxModel,
+            JukeboxPreTrainedModel,
+            load_tf_weights_in_jukebox,
+        )
         from .models.layoutlm import (
             LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
             LayoutLMForMaskedLM,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index bb0d7bccb7457..87e2df354a3c2 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -69,6 +69,7 @@
     hubert,
     ibert,
     imagegpt,
+    jukebox,
     layoutlm,
     layoutlmv2,
     layoutlmv3,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index b0f288e365fa6..1a475c9ee3f99 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -72,6 +72,7 @@
         ("hubert", "HubertConfig"),
         ("ibert", "IBertConfig"),
         ("imagegpt", "ImageGPTConfig"),
+        ("jukebox", "JukeboxConfig"),
         ("layoutlm", "LayoutLMConfig"),
         ("layoutlmv2", "LayoutLMv2Config"),
         ("layoutlmv3", "LayoutLMv3Config"),
@@ -188,6 +189,7 @@
         ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("jukebox", "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("layoutlmv3", "LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -305,6 +307,7 @@
         ("hubert", "Hubert"),
         ("ibert", "I-BERT"),
         ("imagegpt", "ImageGPT"),
+        ("jukebox", "Jukebox"),
         ("layoutlm", "LayoutLM"),
         ("layoutlmv2", "LayoutLMv2"),
         ("layoutlmv3", "LayoutLMv3"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 528f5022852b2..3305f38db2121 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -71,6 +71,7 @@
         ("hubert", "HubertModel"),
         ("ibert", "IBertModel"),
         ("imagegpt", "ImageGPTModel"),
+        ("jukebox", "JukeboxModel"),
         ("layoutlm", "LayoutLMModel"),
         ("layoutlmv2", "LayoutLMv2Model"),
         ("layoutlmv3", "LayoutLMv3Model"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 76a6dee790d6f..38e4ca4491639 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -131,6 +131,7 @@
             ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
             ("hubert", ("Wav2Vec2CTCTokenizer", None)),
             ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
+            ("jukebox", ("JukeboxTokenizer", None)),
             ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
             ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
             ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
new file mode 100644
index 0000000000000..4075d104e72c4
--- /dev/null
+++ b/src/transformers/models/jukebox/__init__.py
@@ -0,0 +1,62 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+
+
+_import_structure = {
+    "configuration_jukebox": ["JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "JukeboxConfig"],
+    "tokenization_jukebox": ["JukeboxTokenizer"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_jukebox"] = [
+        "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "JukeboxModel",
+        "JukeboxPreTrainedModel",
+        "load_tf_weights_in_jukebox",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig
+    from .tokenization_jukebox import JukeboxTokenizer
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_jukebox import (
+            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            JukeboxModel,
+            JukeboxPreTrainedModel,
+            load_tf_weights_in_jukebox,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
new file mode 100644
index 0000000000000..ef7ec0f2201eb
--- /dev/null
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -0,0 +1,475 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Jukebox configuration"""
+from collections import OrderedDict
+from typing import Any, List, Mapping, Optional
+
+from transformers import PreTrainedTokenizer, TensorType, is_torch_available
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "ArthurZ/jukebox-dummy": "https://huggingface.co/ArthurZ/jukebox-dummy/resolve/main/config.json",
+    "ArthurZ/jukebox-1h-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json",
+}
+
+
+class JukeboxConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`JukeboxModel`] or a [`TFJukeboxModel`]. It is
+    used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT-2
+    [small](https://huggingface.co/jukebox) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
+    (5,3), and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
+    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`JukeboxModel`] or [`TFJukeboxModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (`string`, *optional*, defaults to `"cls_index"`):
+            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
+            [`TFJukeboxDoubleHeadsModel`].
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
+            [`TFJukeboxDoubleHeadsModel`].
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used in for the multiple choice head in
+            [`JukeboxDoubleHeadsModel`].
+
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
+            [`TFJukeboxDoubleHeadsModel`].
+
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
+            [`TFJukeboxDoubleHeadsModel`].
+
+            The dropout ratio to be used after the projection and activation.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import JukeboxModel, JukeboxConfig
+
+    >>> # Initializing a Jukebox configuration
+    >>> configuration = JukeboxConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = JukeboxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "jukebox"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+    # params are given for the `n` priors at the same time which means that you have
+    # level2,level1,level0
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        emb_dropout=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        summary_type="cls_index",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        # Global paranmeters
+        sr=16000,
+        sample_length=None,
+        sample_length_in_seconds=1,
+        y_bins=[(120, 4111), (120, 4111), (120, 4111)],
+        use_nonrelative_specloss=True,
+        copy_input=False,
+        resid_dropout=0.0,
+        # MLP parameters
+        mlp_init_scale=0.02,
+        # Attention layer parameters
+        attn_dropout=0.0,
+        attn_init_scale=1.0,
+        # transformer parameters
+        activation_function="gelu_new",
+        sample_hop_length=30000,
+        hop_length=256,
+        multispec_loss_n_fft=(2048, 1024, 512),
+        multispec_loss_hop_length=(240, 120, 50),
+        multispec_loss_window_size=(1200, 600, 240),
+        vq_vae_levels=3,
+        vq_vae_downs_t=(3, 2, 2),
+        vq_vae_strides_t=(2, 2, 2),
+        vq_vae_emmbedding_width=2048,
+        vq_vae_codebook_dimension=2048,
+        vq_vae_width=64,
+        vq_vae_depth=4,
+        vq_vae_m_conv=1,
+        vq_vae_dilation_growth_rate=3,
+        vq_vae_dilation_cycle=None,
+        vq_vae_multipliers=(2, 1, 1),
+        vq_vae_lmu=0.99,  # for the ema?
+        vq_vae_commit=0.02,
+        vq_vae_conv_block_depth=4,
+        vq_vae_conv_block_width=64,
+        spectral=0.0,
+        multispectral=1.0,
+        # vq_vae_loss_fn = 'l1',
+        vq_vae_reverse_decoder_dilation=1,
+        # parameters always false/useless at inference
+        nb_priors=3,
+        spread=None,
+        prime_spread=None,
+        zero_out=False,
+        res_scale=False,
+        pos_init=False,
+        cond_zero_out=False,
+        # args for the priors, 3 priors
+        n_ctx=(8192, 8192, 8192),
+        t_bins=128,
+        downs_t=(3, 2, 2),
+        strides_t=(2, 2, 2),
+        single_enc_dec=[True, False, False],
+        labels=False,
+        merged_decoder=[True, False, False],
+        priors_width=[4096, 2048, 1024],
+        l_bins=256,
+        width=[4800, 1920, 128],
+        depth=[79, 72, 72],
+        n_heads=[8, 1, 1],
+        use_tokens=[True, False, False],
+        n_tokens=[512, 0, 0],
+        attn_order=[10, 2, 2],
+        blocks=16,
+        c_res=1,
+        init_scale=[0.7, 1, 1],
+        cond_depth=[3, 16, 16],
+        cond_width=[128, 1024, 1024],
+        cond_dilation_growth_rate=[1, 3, 3],
+        cond_dilation_cycle=[None, 8, 8],
+        cond_c_res=[0, 1, 1],
+        cond_res_scale=False,
+        prime_width=[128, 128, 128],
+        prime_depth=[18, 3, 3],
+        prime_cond_c_res=[0, 1, 1],
+        prime_heads=4,
+        prime_m_attn=0.25,
+        prime_m_mlp=1.0,
+        prime_blocks=32,
+        prime_init_scale=[0.1, 0.4, 0.4],
+        prime_c_res=1,
+        prime_loss_fraction=[0.4, 0.0, 0.0],
+        prime_attn_order=[2, 0, 0],
+        prime_attn_dropout=0.0,
+        prime_resid_dropout=0.0,
+        prime_emb_dropout=0.0,
+        prime_zero_out=False,
+        prime_res_scale=False,
+        prime_pos_init=False,
+        min_duration=1,
+        max_duration=600.0,
+        fp16_params=True,
+        alignment_layer=[68, None, None],
+        alignment_head=[2, None, None],
+        m_attn=0.25,
+        n_vocab=80,
+        cond_m_conv=1,
+        max_bow_genre_size=1,  # this should only be in the tokenizer
+        name="AudioSamples",
+        **kwargs,
+    ):
+        self.name = name
+        self.prime_zero_out = prime_zero_out
+        self.prime_res_scale = prime_res_scale
+        self.prime_pos_init = prime_pos_init
+        self.prime_resid_dropout = prime_resid_dropout
+        self.prime_attn_dropout = prime_attn_dropout
+        self.prime_m_mlp = prime_m_mlp
+        self.prime_m_attn = prime_m_attn
+        self.prime_emb_dropout = prime_emb_dropout
+        self.prime_attn_order = prime_attn_order
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_dropout = resid_dropout
+        self.emb_dropout = emb_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.max_bow_genre_size = max_bow_genre_size
+        self.cond_m_conv = cond_m_conv
+        self.n_vocab = n_vocab
+        self.sr = sr
+        self.sample_length = sample_length
+        self.sample_length_in_seconds = sample_length_in_seconds
+        self.y_bins = y_bins
+        self.use_nonrelative_specloss = use_nonrelative_specloss
+        self.copy_input = copy_input
+        self.resid_dropout = resid_dropout
+        self.mlp_init_scale = mlp_init_scale
+        self.attn_dropout = attn_dropout
+        self.attn_init_scale = attn_init_scale
+
+        self.activation_function = activation_function
+        self.sample_hop_length = sample_hop_length
+        self.hop_length = hop_length
+        self.multispec_loss_n_fft = multispec_loss_n_fft
+
+        self.multispec_loss_hop_length = multispec_loss_hop_length
+
+        self.multispec_loss_window_size = multispec_loss_window_size
+
+        self.vq_vae_levels = vq_vae_levels
+        self.vq_vae_downs_t = vq_vae_downs_t
+
+        self.vq_vae_strides_t = vq_vae_strides_t
+
+        self.vq_vae_emmbedding_width = vq_vae_emmbedding_width
+        self.vq_vae_codebook_dimension = vq_vae_codebook_dimension
+        self.vq_vae_width = vq_vae_width
+        self.vq_vae_depth = vq_vae_depth
+        self.vq_vae_m_conv = vq_vae_m_conv
+        self.vq_vae_dilation_growth_rate = vq_vae_dilation_growth_rate
+        self.vq_vae_dilation_cycle = vq_vae_dilation_cycle
+        self.vq_vae_multipliers = vq_vae_multipliers
+
+        self.vq_vae_lmu = vq_vae_lmu
+
+        self.vq_vae_commit = vq_vae_commit
+        self.spectral = spectral
+        self.multispectral = multispectral
+
+        self.vq_vae_conv_block_depth = vq_vae_conv_block_depth
+        self.vq_vae_conv_block_width = vq_vae_conv_block_width
+        self.vq_vae_reverse_decoder_dilation = vq_vae_reverse_decoder_dilation
+
+        self.nb_priors = nb_priors
+        self.spread = spread
+        self.prime_spread = prime_spread
+        self.zero_out = zero_out
+        self.res_scale = res_scale
+        self.pos_init = pos_init
+        self.cond_zero_out = cond_zero_out
+        self.n_ctx = n_ctx
+        self.t_bins = t_bins
+        self.l_bins = l_bins
+        self.downs_t = downs_t
+        self.strides_t = strides_t
+        self.single_enc_dec = single_enc_dec
+        self.labels = labels
+        self.merged_decoder = merged_decoder
+        self.priors_width = priors_width
+        self.width = width
+        self.depth = depth
+        self.n_heads = n_heads
+        self.use_tokens = use_tokens
+        self.n_tokens = n_tokens
+        self.attn_order = attn_order
+        self.blocks = blocks
+        self.c_res = c_res
+        self.init_scale = init_scale
+        self.prime_width = prime_width
+        self.prime_depth = prime_depth
+        self.cond_depth = cond_depth
+        self.cond_width = cond_width
+        self.cond_dilation_growth_rate = cond_dilation_growth_rate
+        self.cond_dilation_cycle = cond_dilation_cycle
+        self.cond_c_res = cond_c_res
+        self.cond_res_scale = cond_res_scale
+        self.prime_cond_c_res = prime_cond_c_res
+        self.prime_heads = prime_heads
+        self.prime_attn_order = prime_attn_order
+        self.prime_blocks = prime_blocks
+        self.prime_init_scale = prime_init_scale
+        self.prime_c_res = prime_c_res
+        self.prime_loss_fraction = prime_loss_fraction
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.fp16_params = fp16_params
+        self.alignment_layer = alignment_layer
+        self.alignment_head = alignment_head
+        self.m_attn = m_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+class JukeboxOnnxConfig(OnnxConfigWithPast):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: List[PatchingSpec] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                past_shape = (
+                    batch,
+                    self.num_attention_heads,
+                    past_key_values_length,
+                    self._config.hidden_size // self.num_attention_heads,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
new file mode 100644
index 0000000000000..457add5d574ed
--- /dev/null
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert ViT checkpoints trained with the DINO method."""
+
+
+import argparse
+from pathlib import Path
+
+import torch
+
+import requests
+from transformers import JukeboxConfig, JukeboxModel
+from transformers.models import jukebox
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+def rename_key(dct, old, new):
+    val = dct.pop(old)
+    dct[new] = val
+
+
+PREFIX = "https://openaipublic.azureedge.net/jukebox/models/"
+MODEL_MAPPING = {
+    "jukebox-1b-lyrics": [
+        "5b/vqvae.pth.tar",
+        "5b/prior_level_0.pth.tar",
+        "5b/prior_level_1.pth.tar",
+        "1b_lyrics/prior_level_2.pth.tar",
+    ],
+    "jukebox-5b": [
+        "5b/vqvae.pth.tar5b/prior_level_0.pth.tar",
+        "5b/prior_level_1.pth.tar",
+        "5b/prior_level_2.pth.tar",
+    ],
+    "jukebox-5b-lyrics": [
+        "5b/vqvae.pth.tar5b/prior_level_0.pth.tar",
+        "5b/prior_level_1.pth.tar",
+        "5b_lyrics/prior_level_2.pth.tar",
+    ],
+}
+
+
+@torch.no_grad()
+def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
+    """
+    Copy/paste/tweak model's weights to our Jukebox structure.
+    """
+    for file in MODEL_MAPPING[model_name]:
+        r = requests.get(f"{PREFIX}{file}", allow_redirects=True)
+        open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
+
+    vqvae, *priors = MODEL_MAPPING[model_name.split("/")[-1]]
+    vqvae_dic = torch.load(f"{pytorch_dump_folder_path}/{vqvae.split('/')[-1]}", map_location=torch.device("cpu"))[
+        "model"
+    ]
+
+    weight_dict = []
+    for dict_name in priors:
+        old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"]
+        new_dic = {}
+        for k in old_dic.keys():
+            if k.endswith(".b"):
+                new_dic[k.replace("b", "bias")] = old_dic[k]
+            elif k.endswith(".w"):
+                new_dic[k.replace("w", "weight")] = old_dic[k]
+            elif "level_2" not in dict_name and "cond.model." in k:
+                new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
+            else:
+                new_dic[k] = old_dic[k]
+        weight_dict.append(new_dic)
+
+    config = JukeboxConfig.from_pretrained(model_name)
+    model = JukeboxModel(config)
+
+    model.vqvae.load_state_dict(vqvae_dic)
+    for i in range(len(weight_dict)):
+        model.priors[i].load_state_dict(weight_dict[i])
+
+    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+    print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
+    model.save_pretrained(pytorch_dump_folder_path, save_config=False)
+
+    return weight_dict
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="jukebox-1b-lyrics",
+        type=str,
+        help="Name of the model you'd like to convert.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+    )
+    args = parser.parse_args()
+    convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
+
+
+# previous code to convert dummy :
+# weight_dict = []
+# vqvae_dic = torch.load("/Users/arthur/Work/HuggingFace/jukebox/porting/vqvae.pth")
+# weight_dict.append(vqvae_dic)
+
+# for dict_name in ["up0", "up1", "up2"]:
+#     old_dic = torch.load(f"/Users/arthur/Work/HuggingFace/jukebox/porting/{dict_name}.pth")
+#     new_dic = {}
+#     for k in old_dic.keys():
+#         if k.endswith(".b"):
+#             new_dic[k.replace("b", "bias")] = old_dic[k]
+#         elif k.endswith(".w"):
+#             new_dic[k.replace("w", "weight")] = old_dic[k]
+#         elif dict_name != "up2" and "cond.model." in k:
+#             new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
+#         else:
+#             new_dic[k] = old_dic[k]
+#     weight_dict.append(new_dic)
+
+# return weight_dict
diff --git a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000000..8b31282f253be
--- /dev/null
+++ b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,75 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert OpenAI GPT checkpoint."""
+
+
+import argparse
+
+import torch
+
+from transformers import JukeboxConfig, JukeboxModel, load_tf_weights_in_jukebox
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
+
+
+logging.set_verbosity_info()
+
+
+def convert_jukebox_checkpoint_to_pytorch(jukebox_checkpoint_path, jukebox_config_file, pytorch_dump_folder_path):
+    # Construct model
+    if jukebox_config_file == "":
+        config = JukeboxConfig()
+    else:
+        config = JukeboxConfig.from_json_file(jukebox_config_file)
+    model = JukeboxModel(config)
+
+    # Load weights from numpy
+    load_tf_weights_in_jukebox(model, config, jukebox_checkpoint_path)
+
+    # Save pytorch-model
+    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
+    print(f"Save PyTorch model to {pytorch_weights_dump_path}")
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print(f"Save configuration file to {pytorch_config_dump_path}")
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--jukebox_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the TensorFlow checkpoint path.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--jukebox_config_file",
+        default="",
+        type=str,
+        help=(
+            "An optional config json file corresponding to the pre-trained OpenAI model. \n"
+            "This specifies the model architecture."
+        ),
+    )
+    args = parser.parse_args()
+    convert_jukebox_checkpoint_to_pytorch(
+        args.jukebox_checkpoint_path, args.jukebox_config_file, args.pytorch_dump_folder_path
+    )
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
new file mode 100755
index 0000000000000..ba402da1a253f
--- /dev/null
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -0,0 +1,3374 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Jukebox model."""
+
+import math
+import os
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from packaging import version
+from torch import nn
+
+
+if version.parse(torch.__version__) >= version.parse("1.6"):
+    is_amp_available = True
+    # from torch.cuda.amp import autocast
+else:
+    is_amp_available = False
+
+import gc
+
+from ...activations import ACT2FN
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, logging
+from .configuration_jukebox import JukeboxConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
+_CONFIG_FOR_DOC = "JukeboxConfig"
+_TOKENIZER_FOR_DOC = "JukeboxTokenizer"
+
+JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "ArthurZ/jukebox-dummy",
+    # See all Jukebox models at https://huggingface.co/models?filter=jukebox
+]
+
+
+def empty_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+####################################################################
+# Attention and scalable transformer
+# Import FusedLayerNorm if we have apex, otherwise use regular LayerNorm
+try:
+    from apex.normalization import FusedLayerNorm
+
+    print("Using apex FusedLayerNorm")
+except ImportError:
+    from torch.nn import LayerNorm as FusedLayerNorm
+
+
+# VQ-VAE building blocks
+class Conv1D(nn.Module):
+    def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
+        super(Conv1D, self).__init__()
+        self.n_in = n_in
+        self.n_out = n_out
+        if zero_out:
+            w = torch.zeros(n_in, n_out)
+        else:
+            w = torch.empty(n_in, n_out)
+            nn.init.normal_(w, std=0.02 * init_scale)
+        b = torch.zeros(n_out)
+        self.weight = nn.Parameter(w)  # modified self.w
+        self.bias = nn.Parameter(b)
+
+    def forward(self, x):
+        size_out = (*x.size()[:-1], self.n_out)
+        x = torch.addmm(
+            self.bias.type_as(x), x.view(-1, x.size(-1)), self.weight.type_as(x)
+        )  # If x if float then float else half
+        x = x.view(*size_out)
+        return x
+
+
+class ResConvBlock(nn.Module):
+    def __init__(self, n_in, n_state):
+        super().__init__()
+        self.model = nn.Sequential(
+            nn.ReLU(),
+            nn.Conv2d(n_in, n_state, 3, 1, 1),
+            nn.ReLU(),
+            nn.Conv2d(n_state, n_in, 1, 1, 0),
+        )
+
+    def forward(self, x):
+        return x + self.model(x)
+
+
+class Resnet(nn.Module):
+    def __init__(self, n_in, n_depth, m_conv=1.0):
+        super().__init__()
+        self.model = nn.Sequential(*[ResConvBlock(n_in, int(m_conv * n_in)) for _ in range(n_depth)])
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class ResConv1DBlock(nn.Module):
+    def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
+        super().__init__()
+        padding = dilation
+        self.model = nn.Sequential(
+            nn.ReLU(),
+            nn.Conv1d(n_in, n_state, 3, 1, padding, dilation),
+            nn.ReLU(),
+            nn.Conv1d(n_state, n_in, 1, 1, 0),
+        )
+        if zero_out:
+            out = self.model[-1]
+            nn.init.zeros_(out.weight)
+            nn.init.zeros_(out.bias)
+        self.res_scale = res_scale
+
+    def forward(self, x):
+        return x + self.res_scale * self.model(x)
+
+
+class Resnet1D(nn.Module):
+    def __init__(
+        self,
+        n_in,
+        n_depth,
+        m_conv=1.0,
+        dilation_growth_rate=1,
+        dilation_cycle=None,
+        zero_out=False,
+        res_scale=False,
+        reverse_dilation=False,
+        checkpoint_res=False,
+    ):
+        super().__init__()
+
+        def _get_depth(depth):
+            if dilation_cycle is None:
+                return depth
+            else:
+                return depth % dilation_cycle
+
+        blocks = [
+            ResConv1DBlock(
+                n_in,
+                int(m_conv * n_in),
+                dilation=dilation_growth_rate ** _get_depth(depth),
+                zero_out=zero_out,
+                res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth),
+            )
+            for depth in range(n_depth)
+        ]
+        if reverse_dilation:
+            blocks = blocks[::-1]
+        self.checkpoint_res = checkpoint_res
+        # if self.checkpoint_res == 1:
+        #     # if dist.get_rank() == 0:
+        #     #     print("Checkpointing convs")
+        #     self.blocks = nn.ModuleList(blocks)
+        # else:
+        #    self.model = nn.Sequential(*blocks)
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        # if self.checkpoint_res == 1:
+        #     for block in self.blocks:
+        #         x = checkpoint(block, (x, ), block.parameters(), True)
+        #     return x
+        # else:
+        #     return self.model(x)
+        return self.model(x)
+
+
+class EncoderConvBlock(nn.Module):
+    def __init__(
+        self,
+        input_emb_width,
+        output_emb_width,
+        down_t,
+        stride_t,
+        width,
+        depth,
+        m_conv,
+        dilation_growth_rate=1,
+        dilation_cycle=None,
+        zero_out=False,
+        res_scale=False,
+    ):
+        super().__init__()
+        blocks = []
+        filter_t, pad_t = stride_t * 2, stride_t // 2
+        if down_t > 0:
+            for i in range(down_t):
+                block = nn.Sequential(
+                    nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t),
+                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale),
+                )
+                blocks.append(block)
+            block = nn.Conv1d(width, output_emb_width, 3, 1, 1)
+            blocks.append(block)
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class DecoderConvBock(nn.Module):
+    def __init__(
+        self,
+        input_emb_width,
+        output_emb_width,
+        down_t,
+        stride_t,
+        width,
+        depth,
+        m_conv,
+        dilation_growth_rate=1,
+        dilation_cycle=None,
+        zero_out=False,
+        res_scale=False,
+        reverse_decoder_dilation=False,
+        checkpoint_res=False,
+    ):
+        super().__init__()
+        blocks = []
+        if down_t > 0:
+            filter_t, pad_t = stride_t * 2, stride_t // 2
+            block = nn.Conv1d(output_emb_width, width, 3, 1, 1)
+            blocks.append(block)
+            for i in range(down_t):
+                block = nn.Sequential(
+                    Resnet1D(
+                        width,
+                        depth,
+                        m_conv,
+                        dilation_growth_rate,
+                        dilation_cycle,
+                        zero_out=zero_out,
+                        res_scale=res_scale,
+                        reverse_dilation=reverse_decoder_dilation,
+                        checkpoint_res=checkpoint_res,
+                    ),
+                    nn.ConvTranspose1d(
+                        width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
+                    ),
+                )
+                blocks.append(block)
+        self.model = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+class Encoder(nn.Module):
+    def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
+        super().__init__()
+        self.input_emb_width = input_emb_width
+        self.output_emb_width = output_emb_width
+        self.levels = levels
+        self.downs_t = downs_t
+        self.strides_t = strides_t
+
+        block_kwargs_copy = dict(**block_kwargs)
+        if "reverse_decoder_dilation" in block_kwargs_copy:
+            del block_kwargs_copy["reverse_decoder_dilation"]
+
+        def level_block(level, down_t, stride_t):
+            return EncoderConvBlock(
+                input_emb_width if level == 0 else output_emb_width,
+                output_emb_width,
+                down_t,
+                stride_t,
+                **block_kwargs_copy,
+            )
+
+        self.level_blocks = nn.ModuleList()
+        iterator = zip(list(range(self.levels)), downs_t, strides_t)
+        for level, down_t, stride_t in iterator:
+            self.level_blocks.append(level_block(level, down_t, stride_t))
+
+    def forward(self, x):
+        # N, T = x.shape[0], x.shape[-1]
+        # emb = self.input_emb_width
+        xs = []
+
+        # 64, 32, ...
+        iterator = zip(list(range(self.levels)), self.downs_t, self.strides_t)
+        for level, down_t, stride_t in iterator:
+            level_block = self.level_blocks[-level - 1]
+            x = level_block(x)
+            # emb, T = self.output_emb_width, T // (stride_t**down_t)
+            # assert_shape(x, (N, emb, T))
+            xs.append(x)
+
+        return xs
+
+
+class Decoder(nn.Module):
+    def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
+        super().__init__()
+        self.input_emb_width = input_emb_width
+        self.output_emb_width = output_emb_width
+        self.levels = levels
+
+        self.downs_t = downs_t
+
+        self.strides_t = strides_t
+
+        def level_block(level, down_t, stride_t):
+            return DecoderConvBock(output_emb_width, output_emb_width, down_t, stride_t, **block_kwargs)
+
+        self.level_blocks = nn.ModuleList()
+        iterator = zip(list(range(self.levels)), downs_t, strides_t)
+        for level, down_t, stride_t in iterator:
+            self.level_blocks.append(level_block(level, down_t, stride_t))
+
+        self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
+
+    def forward(self, xs, all_levels=True):
+        if all_levels:
+            assert len(xs) == self.levels
+        else:
+            assert len(xs) == 1
+        x = xs[-1]
+        _, T = x.shape[0], x.shape[-1]
+        # emb = self.output_emb_width
+        # assert_shape(x, (N, emb, T))
+
+        # 32, 64 ...
+        iterator = reversed(list(zip(list(range(self.levels)), self.downs_t, self.strides_t)))
+        for level, down_t, stride_t in iterator:
+            level_block = self.level_blocks[level]
+            x = level_block(x)
+            _, T = self.output_emb_width, T * (stride_t**down_t)
+            # assert_shape(x, (N, emb, T))
+            if level != 0 and all_levels:
+                x = x + xs[level - 1]
+
+        x = self.out(x)
+        return x
+
+
+def dont_update(params):
+    for param in params:
+        param.requires_grad = False
+
+
+def update(params):
+    for param in params:
+        param.requires_grad = True
+
+
+def calculate_strides(strides, downs):
+    return [stride**down for stride, down in zip(strides, downs)]
+
+
+def _loss_fn(loss_fn, x_target, x_pred, hps):
+    if loss_fn == "l1":
+        return torch.mean(torch.abs(x_pred - x_target)) / hps.bandwidth["l1"]
+    elif loss_fn == "l2":
+        return torch.mean((x_pred - x_target) ** 2) / hps.bandwidth["l2"]
+    elif loss_fn == "linf":
+        residual = ((x_pred - x_target) ** 2).reshape(x_target.shape[0], -1)
+        values, _ = torch.topk(residual, hps.linf_k, dim=1)
+        return torch.mean(values) / hps.bandwidth["l2"]
+    elif loss_fn == "lmix":
+        loss = 0.0
+        if hps.lmix_l1:
+            loss += hps.lmix_l1 * _loss_fn("l1", x_target, x_pred, hps)
+        if hps.lmix_l2:
+            loss += hps.lmix_l2 * _loss_fn("l2", x_target, x_pred, hps)
+        if hps.lmix_linf:
+            loss += hps.lmix_linf * _loss_fn("linf", x_target, x_pred, hps)
+        return loss
+    else:
+        assert False, f"Unknown loss_fn {loss_fn}"
+
+
+class BottleneckBlock(nn.Module):
+    def __init__(self, k_bins, emb_width, mu):
+        super().__init__()
+        self.k_bins = k_bins
+        self.emb_width = emb_width
+        self.mu = mu
+        self.reset_k()
+        self.threshold = 1.0
+
+    def reset_k(self):
+        self.init = False
+        self.k_sum = None
+        self.k_elem = None
+        # self.register_buffer('k',  torch.zeros(self.k_bins, self.emb_width).cuda())
+
+        if torch.cuda.is_available():
+            self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width).to("cuda"))
+        else:
+            self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
+
+    def _tile(self, x):
+        d, ew = x.shape
+        if d < self.k_bins:
+            n_repeats = (self.k_bins + d - 1) // d
+            std = 0.01 / np.sqrt(ew)
+            x = x.repeat(n_repeats, 1)
+            x = x + torch.randn_like(x) * std
+        return x
+
+    def init_k(self, x):
+        _, emb_width, k_bins = self.mu, self.emb_width, self.k_bins  # mu,
+        self.init = True
+        # init k_w using random vectors from x
+        y = self._tile(x)
+        _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
+        # dist.broadcast(_k_rand, 0)
+        self.k = _k_rand
+        assert self.k.shape == (k_bins, emb_width)
+        self.k_sum = self.k
+        self.k_elem = torch.ones(k_bins, device=self.k.device)
+
+    def restore_k(self, num_tokens=None, threshold=1.0):
+        _, emb_width, k_bins = self.mu, self.emb_width, self.k_bins  # mu -> _
+        self.init = True
+        assert self.k.shape == (k_bins, emb_width)
+        self.k_sum = self.k.clone()
+        self.k_elem = torch.ones(k_bins, device=self.k.device)
+        if num_tokens is not None:
+            expected_usage = num_tokens / k_bins
+            self.k_elem.data.mul_(expected_usage)
+            self.k_sum.data.mul_(expected_usage)
+        self.threshold = threshold
+
+    def update_k(self, x, x_l):
+        mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
+        with torch.no_grad():
+            # Calculate new centres
+            x_l_onehot = torch.zeros(k_bins, x.shape[0], device=x.device)  # k_bins, N * L
+            x_l_onehot.scatter_(0, x_l.view(1, x.shape[0]), 1)
+
+            _k_sum = torch.matmul(x_l_onehot, x)  # k_bins, w
+            _k_elem = x_l_onehot.sum(dim=-1)  # k_bins
+            y = self._tile(x)
+            _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
+
+            # dist.broadcast(_k_rand, 0)
+            # dist.all_reduce(_k_sum)
+            # dist.all_reduce(_k_elem)
+
+            # Update centres
+            old_k = self.k
+            self.k_sum = mu * self.k_sum + (1.0 - mu) * _k_sum  # w, k_bins
+            self.k_elem = mu * self.k_elem + (1.0 - mu) * _k_elem  # k_bins
+            usage = (self.k_elem.view(k_bins, 1) >= self.threshold).float()
+            self.k = usage * (self.k_sum.view(k_bins, emb_width) / self.k_elem.view(k_bins, 1)) + (1 - usage) * _k_rand
+            _k_prob = _k_elem / torch.sum(_k_elem)  # x_l_onehot.mean(dim=-1)  # prob of each bin
+            entropy = -torch.sum(_k_prob * torch.log(_k_prob + 1e-8))  # entropy ie how diverse
+            used_curr = (_k_elem >= self.threshold).sum()
+            usage = torch.sum(usage)
+            dk = torch.norm(self.k - old_k) / np.sqrt(np.prod(old_k.shape))
+        return dict(entropy=entropy, used_curr=used_curr, usage=usage, dk=dk)
+
+    def preprocess(self, x):
+        # NCT -> NTC -> [NT, C]
+        x = x.permute(0, 2, 1).contiguous()
+        x = x.view(-1, x.shape[-1])  # x_en = (N * L, w), k_j = (w, k_bins)
+
+        if x.shape[-1] == self.emb_width:
+            prenorm = torch.norm(x - torch.mean(x)) / np.sqrt(np.prod(x.shape))
+        elif x.shape[-1] == 2 * self.emb_width:
+            x1, x2 = x[..., : self.emb_width], x[..., self.emb_width :]
+            prenorm = (torch.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (
+                torch.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape))
+            )
+
+            # Normalise
+            x = x1 + x2
+        else:
+            assert False, f"Expected {x.shape[-1]} to be (1 or 2) * {self.emb_width}"
+        return x, prenorm
+
+    def postprocess(self, x_l, x_d, x_shape):
+        # [NT, C] -> NTC -> NCT
+        N, T = x_shape
+        x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous()
+        x_l = x_l.view(N, T)
+        return x_l, x_d
+
+    def quantise(self, x):
+        # Calculate latent code x_l
+        k_w = self.k.t()
+        distance = (
+            torch.sum(x**2, dim=-1, keepdim=True)
+            - 2 * torch.matmul(x, k_w)
+            + torch.sum(k_w**2, dim=0, keepdim=True)
+        )  # (N * L, b)
+        min_distance, x_l = torch.min(distance, dim=-1)
+        fit = torch.mean(min_distance)
+        return x_l, fit
+
+    def dequantise(self, x_l):
+        x = F.embedding(x_l, self.k)
+        return x
+
+    def encode(self, x):
+        N, width, T = x.shape
+
+        # Preprocess.
+        x, prenorm = self.preprocess(x)
+
+        # Quantise
+        x_l, fit = self.quantise(x)
+
+        # Postprocess.
+        x_l = x_l.view(N, T)
+        return x_l
+
+    def decode(self, x_l):
+        N, T = x_l.shape
+        width = self.emb_width
+
+        # Dequantise
+        x_d = self.dequantise(x_l)
+
+        # Postprocess
+        x_d = x_d.view(N, T, width).permute(0, 2, 1).contiguous()
+        return x_d
+
+    def forward(self, x, update_k=True):
+        N, width, T = x.shape
+
+        # Preprocess
+        x, prenorm = self.preprocess(x)
+
+        # Init k if not inited
+        if update_k and not self.init:
+            self.init_k(x)
+
+        # Quantise and dequantise through bottleneck
+        x_l, fit = self.quantise(x)
+        x_d = self.dequantise(x_l)
+
+        # Update embeddings
+        if update_k:
+            update_metrics = self.update_k(x, x_l)
+        else:
+            update_metrics = {}
+
+        # Loss
+        commit_loss = torch.norm(x_d.detach() - x) ** 2 / np.prod(x.shape)
+
+        # Passthrough
+        x_d = x + (x_d - x).detach()
+
+        # Postprocess
+        x_l, x_d = self.postprocess(x_l, x_d, (N, T))
+        return x_l, x_d, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
+
+
+class Bottleneck(nn.Module):
+    def __init__(self, l_bins, emb_width, mu, levels):
+        super().__init__()
+        self.levels = levels
+
+        def level_block(level):
+            return BottleneckBlock(l_bins, emb_width, mu)
+
+        self.level_blocks = nn.ModuleList()
+        for level in range(self.levels):
+            self.level_blocks.append(level_block(level))
+
+    def encode(self, xs):
+        zs = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]
+        return zs
+
+    def decode(self, zs, start_level=0, end_level=None):
+        if end_level is None:
+            end_level = self.levels
+        xs_quantised = [
+            level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], zs)
+        ]
+        return xs_quantised
+
+    def forward(self, xs):
+        zs, xs_quantised, commit_losses, metrics = [], [], [], []
+        for level in range(self.levels):
+            level_block = self.level_blocks[-level - 1]
+            x = xs[level]
+            z, x_quantised, commit_loss, metric = level_block(x, update_k=self.training)
+            zs.append(z)
+            if not self.training:
+                # Be extra paranoid and make sure the encoder weights can't
+                # change from straight-through estimator
+                x_quantised = x_quantised.detach()
+            xs_quantised.append(x_quantised)
+            commit_losses.append(commit_loss)
+            if self.training:
+                metrics.append(metric)
+        return zs, xs_quantised, commit_losses, metrics
+
+
+def stft(sig, hps):
+    return torch.stft(
+        sig,
+        hps.n_fft,
+        hps.hop_length,
+        win_length=hps.window_size,
+        window=torch.hann_window(hps.window_size, device=sig.device),
+    )
+
+
+def spec(x, hps):
+    return torch.norm(stft(x, hps), p=2, dim=-1)
+
+
+class DefaultSTFTValues:
+    def __init__(self, hps):
+        self.sr = hps.sr
+        self.n_fft = 2048
+        self.hop_length = 256
+        self.window_size = 6 * self.hop_length
+
+
+def norm(x):
+    return (x.view(x.shape[0], -1) ** 2).sum(dim=-1).sqrt()
+
+
+def squeeze(x):
+    if len(x.shape) == 3:
+        assert x.shape[-1] in [1, 2]
+        x = torch.mean(x, -1)
+    if len(x.shape) != 2:
+        raise ValueError(f"Unknown input shape {x.shape}")
+    return x
+
+
+def spectral_loss(x_in, x_out, hps):
+    hps = DefaultSTFTValues(hps)
+    spec_in = spec(squeeze(x_in.float()), hps)
+    spec_out = spec(squeeze(x_out.float()), hps)
+    return norm(spec_in - spec_out)
+
+
+def spectral_convergence(x_in, x_out, hps, epsilon=2e-3):
+    hps = DefaultSTFTValues(hps)
+    spec_in = spec(squeeze(x_in.float()), hps)
+    spec_out = spec(squeeze(x_out.float()), hps)
+
+    gt_norm = norm(spec_in)
+    residual_norm = norm(spec_in - spec_out)
+    mask = (gt_norm > epsilon).float()
+    return (residual_norm * mask) / torch.clamp(gt_norm, min=epsilon)
+
+
+class STFTValues:
+    def __init__(self, hps, n_fft, hop_length, window_size):
+        self.sr = hps.sr
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.window_size = window_size
+
+
+def multispectral_loss(x_in, x_out, hps):
+    losses = []
+    assert len(hps.multispec_loss_n_fft) == len(hps.multispec_loss_hop_length) == len(hps.multispec_loss_window_size)
+    args = [hps.multispec_loss_n_fft, hps.multispec_loss_hop_length, hps.multispec_loss_window_size]
+    for n_fft, hop_length, window_size in zip(*args):
+        hps = STFTValues(hps, n_fft, hop_length, window_size)
+        spec_in = spec(squeeze(x_in.float()), hps)
+        spec_out = spec(squeeze(x_out.float()), hps)
+        losses.append(norm(spec_in - spec_out))
+    return sum(losses) / len(losses)
+
+
+def average_metrics(_metrics):
+    metrics = {}
+    for _metric in _metrics:
+        for key, val in _metric.items():
+            if key not in metrics:
+                metrics[key] = []
+            metrics[key].append(val)
+    return {key: sum(vals) / len(vals) for key, vals in metrics.items()}
+
+
+class VQVAE(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if not config.sample_length:
+            downsamples = calculate_strides(config.vq_vae_strides_t, config.vq_vae_downs_t)
+            top_raw_to_tokens = np.prod(downsamples)
+            config.sample_length = (
+                config.sample_length_in_seconds * config.sr // top_raw_to_tokens
+            ) * top_raw_to_tokens
+
+        input_shape = (config.sample_length, 1)
+        block_kwargs = dict(
+            width=config.vq_vae_conv_block_width,
+            depth=config.vq_vae_conv_block_depth,
+            m_conv=config.vq_vae_m_conv,
+            dilation_growth_rate=config.vq_vae_dilation_growth_rate,
+            dilation_cycle=config.vq_vae_dilation_cycle,
+            reverse_decoder_dilation=config.vq_vae_reverse_decoder_dilation,
+        )
+
+        multipliers = config.vq_vae_multipliers
+        emb_width = config.vq_vae_emmbedding_width
+        self.width = config.vq_vae_width
+        self.depth = config.vq_vae_depth
+
+        self.downs_t = downs_t = config.vq_vae_downs_t
+        self.strides_t = strides_t = config.vq_vae_strides_t
+        self.l_bins = l_bins = config.vq_vae_codebook_dimension
+        self.commit = config.vq_vae_commit
+        self.spectral = config.spectral
+        self.multispectral = config.multispectral
+
+        self.sample_length = input_shape[0]
+        x_shape, x_channels = input_shape[:-1], input_shape[-1]
+        self.x_shape = x_shape
+
+        self.downsamples = calculate_strides(strides_t, downs_t)
+        self.hop_lengths = np.cumprod(self.downsamples)
+        self.levels = levels = config.vq_vae_levels
+        self.z_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
+
+        if multipliers is None:
+            self.multipliers = [1] * levels
+        else:
+            assert len(multipliers) == levels, "Invalid number of multipliers"
+            self.multipliers = multipliers
+
+        def _block_kwargs(level):
+            this_block_kwargs = dict(block_kwargs)
+            this_block_kwargs["width"] *= self.multipliers[level]
+            this_block_kwargs["depth"] *= self.multipliers[level]
+            return this_block_kwargs
+
+        def encoder(level):
+            return Encoder(
+                x_channels, emb_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+            )
+
+        def decoder(level):
+            return Decoder(
+                x_channels, emb_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+            )
+
+        self.encoders = nn.ModuleList()
+        self.decoders = nn.ModuleList()
+        for level in range(levels):
+            self.encoders.append(encoder(level))
+            self.decoders.append(decoder(level))
+
+        self.bottleneck = Bottleneck(l_bins, emb_width, config.vq_vae_lmu, levels)
+
+    def preprocess(self, x):
+        # x: NTC [-1,1] -> NCT [-1,1]
+        assert len(x.shape) == 3
+        x = x.permute(0, 2, 1).float()
+        return x
+
+    def postprocess(self, x):
+        # x: NTC [-1,1] <- NCT [-1,1]
+        x = x.permute(0, 2, 1)
+        return x
+
+    def _decode(self, zs, start_level=0, end_level=None):
+        # Decode
+        if end_level is None:
+            end_level = self.levels
+        assert len(zs) == end_level - start_level
+        xs_quantised = self.bottleneck.decode(zs, start_level=start_level, end_level=end_level)
+        assert len(xs_quantised) == end_level - start_level
+
+        # Use only lowest level
+        decoder, x_quantised = self.decoders[start_level], xs_quantised[0:1]
+        x_out = decoder(x_quantised, all_levels=False)
+        x_out = self.postprocess(x_out)
+        return x_out
+
+    def decode(self, zs, start_level=0, end_level=None, bs_chunks=1):
+        z_chunks = [torch.chunk(z, bs_chunks, dim=0) for z in zs]
+        x_outs = []
+        for i in range(bs_chunks):
+            zs_i = [z_chunk[i] for z_chunk in z_chunks]
+            x_out = self._decode(zs_i, start_level=start_level, end_level=end_level)
+            x_outs.append(x_out)
+        return torch.cat(x_outs, dim=0)
+
+    def _encode(self, x, start_level=0, end_level=None):
+        # Encode
+        if end_level is None:
+            end_level = self.levels
+        x_in = self.preprocess(x)
+        xs = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            x_out = encoder(x_in)
+            xs.append(x_out[-1])
+        zs = self.bottleneck.encode(xs)
+        return zs[start_level:end_level]
+
+    def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
+        x_chunks = torch.chunk(x, bs_chunks, dim=0)
+        zs_list = []
+        for x_i in x_chunks:
+            zs_i = self._encode(x_i, start_level=start_level, end_level=end_level)
+            zs_list.append(zs_i)
+        zs = [torch.cat(zs_level_list, dim=0) for zs_level_list in zip(*zs_list)]
+        return zs
+
+    def sample(self, n_samples):
+        zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cuda") for z_shape in self.z_shapes]
+        return self.decode(zs)
+
+    def forward(self, x, hps, loss_fn="l1"):
+        metrics = {}
+
+        # N = x.shape[0]
+
+        # Encode/Decode
+        x_in = self.preprocess(x)
+        xs = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            x_out = encoder(x_in)
+            xs.append(x_out[-1])
+
+        zs, xs_quantised, commit_losses, quantiser_metrics = self.bottleneck(xs)
+        x_outs = []
+        for level in range(self.levels):
+            decoder = self.decoders[level]
+            x_out = decoder(xs_quantised[level : level + 1], all_levels=False)
+            # assert_shape(x_out, x_in.shape)
+            x_outs.append(x_out)
+
+        # Loss
+        def _spectral_loss(x_target, x_out, hps):
+            if hps.use_nonrelative_specloss:
+                sl = spectral_loss(x_target, x_out, hps) / hps.bandwidth["spec"]
+            else:
+                sl = spectral_convergence(x_target, x_out, hps)
+            sl = torch.mean(sl)
+            return sl
+
+        def _multispectral_loss(x_target, x_out, hps):
+            sl = multispectral_loss(x_target, x_out, hps) / hps.bandwidth["spec"]
+            sl = torch.mean(sl)
+            return sl
+
+        recons_loss = torch.zeros(()).to(x.device)
+        spec_loss = torch.zeros(()).to(x.device)
+        multispec_loss = torch.zeros(()).to(x.device)
+        # x_target = audio_postprocess(x.float(), hps)
+        x_target = x.float()
+
+        for level in reversed(range(self.levels)):
+            x_out = self.postprocess(x_outs[level])
+            # x_out = audio_postprocess(x_out, hps)
+            this_recons_loss = _loss_fn(loss_fn, x_target, x_out, hps)
+            this_spec_loss = _spectral_loss(x_target, x_out, hps)
+            this_multispec_loss = _multispectral_loss(x_target, x_out, hps)
+            metrics[f"recons_loss_l{level + 1}"] = this_recons_loss
+            metrics[f"spectral_loss_l{level + 1}"] = this_spec_loss
+            metrics[f"multispectral_loss_l{level + 1}"] = this_multispec_loss
+            recons_loss += this_recons_loss
+            spec_loss += this_spec_loss
+            multispec_loss += this_multispec_loss
+
+        commit_loss = sum(commit_losses)
+        loss = (
+            recons_loss + self.spectral * spec_loss + self.multispectral * multispec_loss + self.commit * commit_loss
+        )
+
+        with torch.no_grad():
+            sc = torch.mean(spectral_convergence(x_target, x_out, hps))
+            l2_loss = _loss_fn("l2", x_target, x_out, hps)
+            l1_loss = _loss_fn("l1", x_target, x_out, hps)
+            linf_loss = _loss_fn("linf", x_target, x_out, hps)
+
+        quantiser_metrics = average_metrics(quantiser_metrics)
+
+        metrics.update(
+            dict(
+                recons_loss=recons_loss,
+                spectral_loss=spec_loss,
+                multispectral_loss=multispec_loss,
+                spectral_convergence=sc,
+                l2_loss=l2_loss,
+                l1_loss=l1_loss,
+                linf_loss=linf_loss,
+                commit_loss=commit_loss,
+                **quantiser_metrics,
+            )
+        )
+
+        for key, val in metrics.items():
+            metrics[key] = val.detach()
+
+        return x_out, loss, metrics
+
+
+# Scalable transformer
+class JukeboxMLP(nn.Module):
+    def __init__(self, width, n_state, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
+        # a single channel is always used in original code
+        super().__init__()
+        self.c_fc = Conv1D(width, n_state, init_scale=init_scale)
+        self.c_proj = Conv1D(n_state, width, zero_out, init_scale=init_scale)
+        self.act = ACT2FN[afn]
+        self.dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
+
+    def forward(self, hidden_states):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class LayerNorm(FusedLayerNorm):
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
+        self.width = np.prod(normalized_shape)
+        self.max_numel = 65535 * self.width
+
+    def forward(self, input):
+        if input.numel() > self.max_numel:
+            return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+        else:
+            return super(LayerNorm, self).forward(input.float()).type_as(input)
+
+
+def repeat(x, n, dim):
+    if dim == -1:
+        dim = len(x.shape) - 1
+    return (
+        x.view(int(np.prod(x.shape[: dim + 1])), 1, int(np.prod(x.shape[dim + 1 :])))
+        .repeat(1, n, 1)
+        .view(*x.shape[:dim], n * x.shape[dim], *x.shape[dim + 1 :])
+    )
+
+
+def get_mask(mask, q_l, kv_l, blocks, spread, device, sample, sample_t):
+    # returns a mask of shape 1 x 1 x q_l x kv_l or None if masking is not needed.
+    if mask is None or q_l == 1:
+        return None
+    offset = sample_t - q_l if sample else max(kv_l - q_l, 0)
+    if mask == "autoregressive":
+        # Masked dense
+        mask = torch.ones(q_l, kv_l, device=device).tril(offset)
+    elif mask == "summary":
+        # Masked summary
+        mask = (
+            torch.nn.functional.pad(
+                torch.ones(q_l, q_l, device=device).tril().view(q_l, blocks, q_l // blocks)[:, :-1, -kv_l // blocks :],
+                (0, 0, 1, 0),
+                value=1,
+            )
+            .contiguous()
+            .view(q_l, kv_l)
+        )
+    elif mask == "prime":
+        mask = torch.ones(q_l, kv_l, device=device).tril(offset)
+    return mask.view(1, 1, q_l, kv_l)
+
+
+class JukeboxAttention(nn.Module):
+    # previously FactoredAttention
+    def __init__(
+        self,
+        width,
+        n_ctx,
+        n_state,
+        n_head,
+        attn_dropout=0.0,
+        resid_dropout=0.0,
+        scale=True,
+        mask=False,
+        zero_out=False,
+        init_scale=1.0,
+        checkpoint_attn=0,
+        attn_func=0,
+        blocks=None,
+        spread=None,
+        encoder_dims=None,
+        prime_len=None,
+    ):
+        super().__init__()
+        self.width = width  # should have a better name
+        self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
+        self.n_state = n_state
+        assert n_state % n_head == 0
+        self.n_head = n_head
+        self.scale = scale
+        self.mask = mask
+        if attn_func == 6:
+            self.c_attn = Conv1D(width, n_state, init_scale=init_scale)
+            self.c_enc_kv = Conv1D(width, n_state * 2, init_scale=init_scale)
+        else:
+            self.c_attn = Conv1D(width, n_state * 3, init_scale=init_scale)
+        self.c_proj = Conv1D(n_state, width, zero_out, init_scale=init_scale)
+        self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
+        self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
+
+        # Sequence of length l is factored as [blocks, l // blocks]
+        self.attn_func = attn_func
+        self.qkv, self.attn, self.attn_mask = {
+            0: (self.factored_qkv, self.dense_attn, "autoregressive"),  # Attend to all positions
+            1: (self.factored_qkv, self.block_attn, "autoregressive"),  # Attend to your block
+            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
+            3: (self.factored_qkv, self.prev_block_attn, None),  # Attend to previous block
+            4: (self.factored_qkv, self.summary_attn, "summary"),  # Attend to last position of each block
+            5: (self.factored_qkv, self.summary_spread_attn, "summary"),
+            6: (self.decode_qkv, self.decode_attn, None),
+            7: (self.prime_qkv, self.prime_attn, "prime"),
+        }[
+            attn_func
+        ]  # Attend to last k position of each block
+
+        self.blocks = blocks
+        self.spread = spread
+        if blocks is not None:
+            assert n_ctx % blocks == 0
+            self.block_ctx = n_ctx // blocks
+        self.checkpoint_attn = checkpoint_attn  # 0: None, 1: Attn after heads split, 2: Attn
+
+        self.sample_t = 0
+        self.cache = {}
+        self.encoder_dims = encoder_dims
+        self.prime_len = prime_len
+        self.record_attn = False
+        self.w = None
+
+    def _attn(self, q, k, v, sample):
+        scale = 1.0 / math.sqrt(math.sqrt(self.n_state // self.n_head))
+        if self.training:
+            w = torch.matmul(q * scale, k * scale)
+        else:
+            w = torch.matmul(q, k)
+            w.mul_(scale * scale)
+        wtype = w.dtype
+        w = w.float()
+        if self.mask:
+            # Generate appropriate mask to mask out all positions before current
+            # Might take up lot of memory for dense, so can cache it
+            mask = get_mask(
+                self.attn_mask, q.size(-2), k.size(-1), self.blocks, self.spread, w.device, sample, self.sample_t
+            )
+            if mask is not None:
+                # print(mask)
+                w = w * mask + -1e9 * (1 - mask)
+            w = F.softmax(w, dim=-1).type(wtype)
+        else:
+            w = F.softmax(w, dim=-1).type(wtype)
+        if self.record_attn:
+            self.w = w  # .float().cpu().numpy()
+            if self.attn_func == 7:
+                # only keep music queries and lyrics keys/values
+                self.w = self.w[:, :, self.prime_len :, : self.prime_len]
+        w = self.attn_dropout(w)
+        a = torch.matmul(w, v)
+        return a
+
+    def merge_heads(self, x):
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = (*x.size()[:-2], x.size(-2) * x.size(-1))
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        new_x_shape = (*x.size()[:-1], self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)
+        else:
+            return x.permute(0, 2, 1, 3)
+
+    def dense_attn(self, query, key, value, sample):
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+        # if self.checkpoint_attn == 1 and not sample:
+        #     a = checkpoint(lambda q,k,v,s=sample: self._attn(q,k,v,s), (query, key, value),
+        #                (), True)
+        # else:
+        a = self._attn(query, key, value, sample)
+        a = self.merge_heads(a)
+        return a
+
+    def block_attn(self, q, k, v, sample):
+        _, block_ctx = (
+            self.blocks,
+            self.block_ctx,
+        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = q.view(bs * ql // block_ctx, block_ctx, d)
+            if ql < l:
+                l = ql
+                k = k[:, -l:].contiguous()
+                v = v[:, -l:].contiguous()
+            k = k.view(bs * l // block_ctx, block_ctx, d)
+            v = v.view(bs * l // block_ctx, block_ctx, d)
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def transpose_block_attn(self, q, k, v, sample):
+        _, block_ctx = (
+            self.blocks,
+            self.block_ctx,
+        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            block_l = (l - 1) % block_ctx
+            k = k[:, block_l::block_ctx, :]
+            v = v[:, block_l::block_ctx, :]
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = (
+                q.view(bs, ql // block_ctx, block_ctx, d)
+                .transpose(1, 2)
+                .contiguous()
+                .view(bs * block_ctx, ql // block_ctx, d)
+            )
+            k = (
+                k.view(bs, l // block_ctx, block_ctx, d)
+                .transpose(1, 2)
+                .contiguous()
+                .view(bs * block_ctx, l // block_ctx, d)
+            )
+            v = (
+                v.view(bs, l // block_ctx, block_ctx, d)
+                .transpose(1, 2)
+                .contiguous()
+                .view(bs * block_ctx, l // block_ctx, d)
+            )
+            return (
+                self.dense_attn(q, k, v, sample)
+                .view(bs, block_ctx, ql // block_ctx, d)
+                .transpose(1, 2)
+                .contiguous()
+                .view(bs, ql, d)
+            )
+
+    def prev_block_attn(self, q, k, v, sample):
+        _, block_ctx = (
+            self.blocks,
+            self.block_ctx,
+        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
+            block = (l - 1) // block_ctx
+            prev_l = (block - 1) * block_ctx
+            if block > 0:
+                assert prev_l == 0
+                k = k[:, prev_l : prev_l + block_ctx, :]
+                v = v[:, prev_l : prev_l + block_ctx, :]
+            else:
+                k = torch.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
+                v = torch.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            ql = q.shape[1]
+            q = q.view(bs * ql // block_ctx, block_ctx, d)
+            k = torch.nn.functional.pad(
+                k.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(bs * l // block_ctx, block_ctx, d)
+            v = torch.nn.functional.pad(
+                v.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(bs * l // block_ctx, block_ctx, d)
+            if ql < l:
+                qb = ql // block_ctx
+                kb = l // block_ctx
+                l = ql
+                k = k.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
+                v = v.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def summary_attn(self, q, k, v, sample):
+        blocks, block_ctx = (
+            self.blocks,
+            self.block_ctx,
+        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            k = torch.nn.functional.pad(k[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
+            v = torch.nn.functional.pad(v[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
+            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            k = torch.nn.functional.pad(
+                k.view(bs, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # bs, blocks, d
+            v = torch.nn.functional.pad(
+                v.view(bs, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # bs, blocks, d
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def summary_spread_attn(self, q, k, v, sample):
+        blocks, _, spread = (
+            self.blocks,
+            self.block_ctx,
+            self.spread,
+        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
+        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        if sample:
+            assert False, "Not yet implemented"
+            # k = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
+            # v = torch.nn.functional.pad(v,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
+            # return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+        else:
+            k = (
+                torch.nn.functional.pad(k.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                .contiguous()
+                .view(bs, blocks * spread, d)
+            )  # bs, blocks * spread, d
+            v = (
+                torch.nn.functional.pad(v.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                .contiguous()
+                .view(bs, blocks * spread, d)
+            )  # bs, blocks * spread, d
+            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+
+    def prime_attn(self, q, k, v, sample):
+        prime_len = self._prime_len
+        k = k[:, :prime_len]
+        v = v[:, :prime_len]
+        return self.dense_attn(q, k, v, sample)
+
+    def decode_attn(self, q, k, v, sample):
+        assert (
+            k.shape[1] == v.shape[1] == self.encoder_dims
+        ), f"k: {k.shape}, v: {v.shape}, enc_dims: {self.encoder_dims}"
+        return self.dense_attn(q, k, v, sample)
+
+    def factored_qkv(self, x, encoder_kv=None, sample=False):
+        curr_ctx = x.shape[1]
+        assert encoder_kv is None
+        query, key, value = x.chunk(3, dim=2)
+        if sample:
+            self.sample_t += curr_ctx
+            key, value = self._append_cache(key, value)
+            l_cache = self._suff_cache_len()
+            if self._cache_len() > l_cache:
+                self._slice_cache(-l_cache)
+            if curr_ctx > 1:
+                if self.attn_func != 0:
+                    query = self._pad_to_block_ctx(query, query=True)
+                    key = self._pad_to_block_ctx(key)
+                    value = self._pad_to_block_ctx(value)
+                    assert key.shape[1] % self.block_ctx == 0
+                    assert query.shape[1] % self.block_ctx == 0
+                assert key.shape[1] == value.shape[1]
+                assert query.shape[1] <= key.shape[1]
+                sample = False
+            else:
+                key = self.cache["key"]
+                value = self.cache["value"]
+        return query, key, value, sample
+
+    def prime_qkv(self, x, encoder_kv=None, sample=False):
+        curr_ctx = x.shape[1]
+        assert encoder_kv is None
+        query, key, value = x.chunk(3, dim=2)
+        if sample:
+            if self._cache_len() < self._prime_len:
+                self._append_cache(key, value)
+            if self._cache_len() > self._prime_len:
+                self._slice_cache(0, self._prime_len)
+            key, value = self.cache["key"], self.cache["value"]
+            self.sample_t += curr_ctx
+            assert (
+                key.shape[1] == value.shape[1] == self._suff_cache_len()
+            ), f"k: {key.shape}, v: {value.shape}, prime_dims: {self._suff_cache_len()}"
+        else:
+            assert (
+                key.shape[1] == value.shape[1] == self.n_ctx
+            ), f"k: {key.shape}, v: {value.shape}, prime_dims: {self.n_ctx}"
+        assert key.shape[0] == value.shape[0] == query.shape[0], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
+        assert key.shape[2] == value.shape[2] == query.shape[2], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
+        return query, key, value, sample
+
+    def decode_qkv(self, x, encoder_kv=None, sample=False):
+        curr_ctx = x.shape[1]
+        assert encoder_kv is not None
+        query = x
+        if sample:
+            if self.sample_t == 0:
+                self.cache["key"], self.cache["value"] = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+            key, value = self.cache["key"], self.cache["value"]
+            self.sample_t += curr_ctx
+        else:
+            key, value = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+        assert key.shape[0] == value.shape[0] == query.shape[0], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
+        assert (
+            key.shape[1] == value.shape[1] == self.encoder_dims
+        ), f"k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}"
+        assert key.shape[2] == value.shape[2] == query.shape[2], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
+        return query, key, value, sample
+
+    def forward(self, x, encoder_kv=None, sample=False):
+        curr_ctx = x.shape[1]
+        x = self.c_attn(x)
+        query, key, value, sample = self.qkv(x, encoder_kv=encoder_kv, sample=sample)
+        # if self.checkpoint_attn == 2 and not sample:
+        #     a = checkpoint(lambda q,k,v,s=sample: self.attn(q,k,v,s), (query, key, value), (), True)
+        # else:
+        a = self.attn(query, key, value, sample)
+        if a.shape[1] != curr_ctx:
+            offset = self._offset(curr_ctx)
+            a = a[:, offset : offset + curr_ctx, :].contiguous()
+        a = self.c_proj(a)
+        return self.resid_dropout(a)
+
+    @property
+    def _prime_len(self):
+        prime_len = self.prime_len
+        assert prime_len is not None
+        prime_blocks = (prime_len // self.blocks) + 1
+        return prime_blocks * self.blocks
+
+    def _offset(self, curr_ctx):
+        if self.attn_func == 0:
+            return 0
+        return (self.sample_t - curr_ctx) % self.block_ctx
+
+    def _pad_to_block_ctx(self, x, query=False):
+        l = x.shape[1]
+        offset = self._offset(l) if query else 0
+        n_blocks = (l + offset + self.block_ctx - 1) // self.block_ctx
+        pad = n_blocks * self.block_ctx - l - offset
+        if pad == 0 and offset == 0:
+            return x
+        else:
+            return F.pad(x, (0, 0, offset, pad))
+
+    def _cache_len(self):
+        return 0 if "key" not in self.cache else self.cache["key"].shape[1]
+
+    def _suff_cache_len(self):
+        """
+        Precondition:
+            key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
+            location in the context.
+        """
+        if self.attn_func == 0:
+            return self.sample_t
+        elif self.attn_func == 1:
+            return (self.sample_t - 1) % self.block_ctx + 1
+        elif self.attn_func == 2:
+            return self.sample_t
+        elif self.attn_func == 3:
+            if self.sample_t <= self.block_ctx:
+                return self.sample_t
+            else:
+                curr_block = (self.sample_t - 1) % self.block_ctx + 1
+                prev_block = self.block_ctx
+                return curr_block + prev_block
+        elif self.attn_func == 6:
+            return self.encoder_dims
+        elif self.attn_func == 7:
+            return min(self.sample_t, self._prime_len)
+        else:
+            raise NotImplementedError()
+
+    def _slice_cache(self, start, end=None):
+        self.cache["key"] = self.cache["key"][:, start:end]
+        self.cache["value"] = self.cache["value"][:, start:end]
+
+    def _append_cache(self, key, value):
+        if "key" not in self.cache:
+            self.cache["key"] = key
+            self.cache["value"] = value
+        else:
+            old_key, old_value = key, value
+            key = torch.cat([self.cache["key"], old_key], dim=1)
+            value = torch.cat([self.cache["value"], old_value], dim=1)
+            del self.cache["key"]
+            del self.cache["value"]
+            del old_key
+            del old_value
+            self.cache["key"] = key
+            self.cache["value"] = value
+        return self.cache["key"], self.cache["value"]
+
+    def del_cache(self):
+        self.sample_t = 0
+        if "key" in self.cache:
+            del self.cache["key"]
+        if "value" in self.cache:
+            del self.cache["value"]
+        self.cache = {}
+
+    def check(self):
+        blocks = self.blocks or 1
+        spread = self.spread or 1
+        bs, l, d = (4, self.n_ctx, self.width)
+        x = torch.randn(bs, l, d).cpu()
+        x.requires_grad = True
+        x_out = self.forward(x)  # bs, l, d
+        loss = x_out.mean(dim=-1)  # bs, l
+        pos = 60
+        grad = torch.autograd.grad(loss[2, pos], x)[0]
+
+        assert grad.shape == (bs, l, d)
+        assert (grad[:2] == 0).all()
+        assert (grad[3:] == 0).all()
+        assert (grad[2, (pos + 1) :] == 0).all()
+        pos_grad = (torch.sum(grad[2] ** 2, dim=-1) > 0).nonzero().view(-1).cpu()
+
+        block_pos = pos - (pos % (l // blocks))
+        exp_pos_grad = {
+            0: torch.arange(pos),
+            1: torch.arange(block_pos, pos),
+            2: torch.arange(pos % (l // blocks), pos, l // blocks),
+            3: torch.arange(block_pos - l // blocks, block_pos),
+            4: torch.arange(l // blocks - 1, pos, l // blocks),
+            5: ((torch.arange(pos) % (l // blocks) >= (l // blocks - spread)) & (torch.arange(pos) < block_pos))
+            .nonzero()
+            .view(-1),
+        }[self.attn_func]
+        exp_pos_grad = torch.cat([exp_pos_grad, torch.tensor([pos])], dim=-1)
+
+        assert (len(pos_grad) == len(exp_pos_grad)) and (pos_grad == exp_pos_grad).all(), (
+            f"Expected pos grad {exp_pos_grad} got {pos_grad} for attn_func {self.attn_func} pos {pos} l {l} blocks"
+            f" {blocks}"
+        )
+
+    def check_cache(self, n_samples, sample_t, fp16):
+        assert self.sample_t == sample_t, f"{self.sample_t} != {sample_t}"
+        if sample_t == 0:
+            assert self.cache == {}
+        else:
+            dtype = {True: torch.float16, False: torch.float32}[fp16]
+            l_cache = self._suff_cache_len()
+            assert self.cache["key"].shape == (n_samples, l_cache, self.n_state)
+            assert self.cache["value"].shape == (n_samples, l_cache, self.n_state)
+            assert self.cache["key"].dtype == dtype, f"Expected {dtype}, got {self.cache['key'].dtype}"
+            assert self.cache["value"].dtype == dtype, f"Expected {dtype}, got {self.cache['value'].dtype}"
+
+    def check_sample(self):
+        torch.manual_seed(42)
+        bs, l, d = (4, self.n_ctx, self.width)
+        prime = 5
+        x = torch.randn(bs, l, d).cpu()
+        xs = torch.chunk(x, l, dim=1)
+        assert self.sample_t == 0
+        assert self.cache == {}
+
+        with torch.no_grad():
+            enc_l = self.encoder_dims
+            encoder_kv = None
+            if self.attn_func == 6:
+                encoder_kv = torch.randn(bs, enc_l, d).cpu()
+
+            # Normal path
+            x_out_normal = self.forward(x, encoder_kv=encoder_kv)
+
+            # Sampling path
+            x_out_sample = torch.cat(
+                [self.forward(xs[i], encoder_kv=encoder_kv, sample=True) for i in range(l)], dim=1
+            )
+        max_err = torch.max(torch.abs(x_out_sample - x_out_normal))
+        assert max_err < 1e-8, (
+            "Max sampling err is"
+            f" {max_err} {[i for i in range(l) if torch.max(torch.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
+        )
+
+        with torch.no_grad():
+            x_out_normal = x_out_normal[:, :prime, :]
+            # Prime sampling path
+            self.del_cache()
+            x_out_sample = self.forward(x[:, :prime, :].contiguous(), encoder_kv=encoder_kv, sample=True)
+            self.check_cache(bs, prime, False)
+
+        max_err = torch.max(torch.abs(x_out_sample - x_out_normal))
+        assert max_err < 1e-8, (
+            "Max prime sampling err is"
+            f" {max_err} {[i for i in range(prime) if torch.max(torch.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
+        )
+
+    def check_chunks(self, chunk_size):
+        torch.manual_seed(42)
+        bs, l, d = (4, self.n_ctx, self.width)
+        enc_l = self.encoder_dims
+        assert l % chunk_size == 0
+        n_chunks = l // chunk_size
+        with torch.no_grad():
+            encoder_kv = None
+            x = torch.randn(bs, l, d).cpu()
+            if self.attn_func == 6:
+                encoder_kv = torch.randn(bs, enc_l, d).cpu()
+
+            self.del_cache()
+            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=False)
+            self.del_cache()
+            y_forw_sample = self.forward(x, encoder_kv=encoder_kv, sample=True)
+            max_err = torch.max(torch.abs(y_forw - y_forw_sample))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_sample)[:, i, :]) > 1e-6]}"
+            )
+
+            self.del_cache()
+            x_chunks = torch.chunk(x, n_chunks, dim=1)
+            y_chunks = []
+            total_len = 0
+            for x_chunk in x_chunks:
+                y_chunk = self.forward(x_chunk.contiguous(), encoder_kv=encoder_kv, sample=True)
+                total_len += x_chunk.shape[1]
+                self.check_cache(bs, total_len, False)
+                y_chunks.append(y_chunk)
+            y_forw_in_chunks = torch.cat(y_chunks, dim=1)
+
+            max_err = torch.max(torch.abs(y_forw - y_forw_in_chunks))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
+            )
+
+
+class JukeboxBlock(nn.Module):
+    # previously ResAttnBlock
+    def __init__(
+        self,
+        width,
+        n_ctx,
+        n_head,
+        attn_dropout=0.0,
+        resid_dropout=0.0,
+        afn="gelu",
+        scale=True,
+        mask=False,
+        zero_out=False,
+        init_scale=1.0,
+        res_scale=1.0,
+        m_attn=0.25,
+        m_mlp=1.0,
+        checkpoint_attn=0,
+        checkpoint_mlp=0,
+        attn_func=0,
+        blocks=None,
+        spread=None,
+        encoder_dims=None,
+        prime_len=None,
+    ):
+        super().__init__()
+        self.attn = JukeboxAttention(
+            width=width,
+            n_ctx=n_ctx,
+            n_state=int(m_attn * width),
+            n_head=n_head,
+            attn_dropout=attn_dropout,
+            resid_dropout=resid_dropout,
+            scale=scale,
+            mask=mask,
+            zero_out=zero_out,
+            init_scale=init_scale,
+            checkpoint_attn=checkpoint_attn,
+            attn_func=attn_func,
+            blocks=blocks,
+            spread=spread,
+            encoder_dims=encoder_dims,
+            prime_len=prime_len,
+        )
+        self.ln_0 = LayerNorm(width)
+        self.mlp = JukeboxMLP(
+            width=width,
+            n_state=int(m_mlp * width),
+            resid_dropout=resid_dropout,
+            afn=afn,
+            zero_out=zero_out,
+            init_scale=init_scale,
+        )
+        self.ln_1 = LayerNorm(width)
+        self.res_scale = res_scale
+
+        self.checkpoint_attn = checkpoint_attn
+        self.checkpoint_mlp = checkpoint_mlp
+        self.width = width
+        self.attn_func = attn_func
+
+    def forward(self, x, encoder_kv, sample=False):
+        if sample:
+            a = self.attn(self.ln_0(x), encoder_kv, sample)
+            m = self.mlp(self.ln_1(x + a))
+        else:
+            a = self.attn(self.ln_0(x), encoder_kv, sample)
+            m = self.mlp(self.ln_1(x + a))
+            # if self.attn_func == 6:
+            #     assert encoder_kv is not None
+            #     a = checkpoint(lambda _x,_enc_kv,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
+            #                    (x,encoder_kv),
+            #                    (*self.attn.parameters(), *self.ln_0.parameters()),
+            #                    self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
+            # else:
+            #     assert encoder_kv is None
+            #     a = checkpoint(lambda _x,_enc_kv=None,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
+            #                    (x,),
+            #                    (*self.attn.parameters(), *self.ln_0.parameters()),
+            #                    self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
+            # m = checkpoint(lambda _x: self.mlp(self.ln_1(_x)), (x + a,),
+            #                (*self.mlp.parameters(), *self.ln_1.parameters()),
+            #                self.checkpoint_mlp == 1)
+            pass
+        if self.res_scale == 1.0:
+            h = x + a + m
+        else:
+            h = x + self.res_scale * (a + m)
+        return h
+
+
+class JukeboxTransformer(nn.Module):
+    def __init__(
+        self,
+        width,
+        n_ctx,
+        n_head,
+        n_depth,
+        attn_dropout=0.0,
+        resid_dropout=0.0,
+        afn="gelu",
+        scale=True,
+        mask=False,
+        zero_out=False,
+        init_scale=1.0,
+        res_scale=False,
+        m_attn=0.25,
+        m_mlp=1.0,
+        checkpoint_attn=0,
+        checkpoint_mlp=0,
+        checkpoint_res=0,
+        attn_order=0,
+        blocks=None,
+        spread=None,
+        encoder_dims=None,
+        prime_len=None,
+    ):
+        super().__init__()
+        self.width = width
+        self.n_ctx = n_ctx
+        self.encoder_dims = encoder_dims
+        self.blocks = blocks
+        if blocks is not None:
+            assert n_ctx % blocks == 0
+            self.block_ctx = n_ctx // blocks
+        self.prime_len = prime_len
+        self.n_head = n_head
+
+        res_scale = 1.0 / n_depth if res_scale else 1.0
+
+        # Orders of attn_func
+        attn_func = {
+            0: lambda d: 0,  # Complete dense attn
+            1: lambda d: [1, 2][d % 2],  # Alternate row and column attn
+            2: lambda d: [1, 2, 3][d % 3],  # Alternate row, column and previous row attn
+            3: lambda d: [1, 4][d % 2],  # Alternate row and last column
+            4: lambda d: [1, 5][d % 2],  # Alternate row and last k columns
+            5: lambda d: [1, 4, 1, 1][d % 4],  # Alternate row, last column, row, row
+            6: lambda d: [1, 2, 3, 6][d % 4],
+            7: lambda d: [*[1, 2, 3] * 5, 6][d % 16],
+            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10],  # Used by separated_enc_dec model with lyrics
+            9: lambda d: [1, 2, 3, 0][d % 4],
+            10: lambda d: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
+                d % 79
+            ],  # Used by large separated_enc_dec model with lyrics
+            11: lambda d: [6, 6, 0][d % 3] if d % 16 == 15 else [1, 2, 3][d % 3],
+            12: lambda d: [7, 7, 0][d % 3]
+            if d % 16 == 15
+            else [1, 2, 3][d % 3],  # Used by single_enc_dec model with lyrics
+        }[attn_order]
+
+        # attn_cycle = {0: 1, 1: 2, 2: 3, 3: 2, 4: 2, 5: 4, 6: 4, 7: 16, 8: 10, 9: 4, 10: 79, 11: 16, 12: 16}[attn_order]
+        # assert n_depth % attn_cycle == 0, f'Depth {n_depth} not a multiple of cycle {attn_cycle} for attn_order {attn_order}'
+
+        def attn_block(d):
+            return JukeboxBlock(
+                width=width,
+                n_ctx=n_ctx,
+                n_head=n_head,
+                attn_dropout=attn_dropout,
+                resid_dropout=resid_dropout,
+                afn=afn,
+                scale=scale,
+                mask=mask,
+                zero_out=zero_out if attn_func(d) != 6 else True,
+                init_scale=init_scale,
+                res_scale=res_scale,
+                m_attn=m_attn,
+                m_mlp=m_mlp,
+                checkpoint_attn=checkpoint_attn,
+                checkpoint_mlp=checkpoint_mlp,
+                attn_func=attn_func(d),
+                blocks=blocks,
+                spread=spread,
+                encoder_dims=encoder_dims,
+                prime_len=prime_len,
+            )
+
+        self.checkpoint_res = checkpoint_res
+        self._attn_mods = nn.ModuleList()
+        for d in range(n_depth):
+            self._attn_mods.append(attn_block(d))
+        self.ws = []
+
+    def set_record_attn(self, record_attn):
+        """
+        Arguments:
+            record_attn (bool or set): Makes forward prop dump self-attention
+                softmaxes to self.ws. Either a set of layer indices indicating which layers to store, or a boolean
+                value indicating whether to dump all.
+        """
+
+        def _should_record_attn(layer_idx):
+            if isinstance(record_attn, bool):
+                return record_attn
+            return layer_idx in record_attn
+
+        for i, l in enumerate(self._attn_mods):
+            l.attn.record_attn = _should_record_attn(i)
+        if record_attn:
+            assert self.ws == []
+            for l in self._attn_mods:
+                assert l.attn.w is None
+        else:
+            self.ws = []
+            for l in self._attn_mods:
+                l.attn.w = None
+
+    def forward(self, x, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
+        if fp16:
+            x = x.half()
+
+        # Blocks
+        for i, l in enumerate(self._attn_mods):
+            # if self.checkpoint_res == 1 and not sample:
+            #     if l.attn_func == 6:
+            #         assert encoder_kv is not None
+            #         f = functools.partial(l, sample=sample)
+            #         x = checkpoint(f, (x, encoder_kv), l.parameters(), True)
+            #     else:
+            #         f = functools.partial(l, encoder_kv=None, sample=sample)
+            #         x = checkpoint(f, (x,), l.parameters(), True)
+            # else:
+            if l.attn_func == 6:
+                x = l(x, encoder_kv=encoder_kv, sample=sample)
+            else:
+                x = l(x, encoder_kv=None, sample=sample)
+            if l.attn.record_attn:
+                self.ws.append(l.attn.w)
+        if not fp16_out:
+            x = x.float()
+        return x
+
+    def check_cache(self, n_samples, sample_t, fp16):
+        for l in self._attn_mods:
+            l.attn.check_cache(n_samples, sample_t, fp16)
+
+    def del_cache(self):
+        for l in self._attn_mods:
+            l.attn.del_cache()
+
+    def check_sample(self):
+        bs, l, s, d = (4, self.n_ctx, self.encoder_dims, self.width)
+        # prime = 5
+        with torch.no_grad():
+            encoder_kv = torch.randn(bs, s, d).cpu()
+            x = torch.randn(bs, l, d).cpu()
+            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=True)
+
+            self.del_cache()
+            x_chunks = torch.chunk(x, 4, dim=1)
+            y_chunks = []
+            n = 0
+            for x_chunk in x_chunks:
+                self.check_cache(bs, n, False)
+                y_chunk = self.forward(x_chunk, encoder_kv=encoder_kv, sample=True)
+                y_chunks.append(y_chunk)
+                n += x_chunk.shape[1]
+            self.check_cache(bs, n, False)
+            y_forw_in_chunks = torch.cat(y_chunks, dim=1)
+
+            max_err = torch.max(torch.abs(y_forw - y_forw_in_chunks))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
+            )
+
+
+class PositionEmbedding(nn.Module):
+    def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
+        super().__init__()
+        self.input_shape = input_shape
+        self.input_dims = input_dims = np.prod(input_shape)
+        self.pos_init = pos_init
+        # if pos_init:
+        #    self.register_buffer("pos", torch.tensor(get_pos_idx(input_shape)).long())
+        #    self._pos_embs = nn.ModuleList()
+        #    for i in range(len(input_shape)):
+        #        emb = nn.Embedding(input_shape[i], width)
+        #        nn.init.normal_(emb.weight, std=0.02)
+        #        self._pos_embs.append(emb)
+        # else:
+        self.pos_emb = nn.Parameter(get_normal(input_dims, width, std=0.01 * init_scale))
+
+    def forward(self):
+        if self.pos_init:
+            pos_emb = sum([self._pos_embs[i](self.pos[:, i]) for i in range(len(self.input_shape))])
+        else:
+            pos_emb = self.pos_emb
+        return pos_emb
+
+
+class JukeboxConditionalAutoregressive(nn.Module):
+    # previously ConditionalAutoregressive2D, renamed it to prior
+    def __init__(
+        self,
+        input_shape,
+        bins,
+        width=128,
+        depth=2,
+        heads=1,
+        attn_dropout=0.0,
+        resid_dropout=0.0,
+        emb_dropout=0.0,
+        mask=True,
+        zero_out=False,
+        init_scale=1.0,
+        res_scale=False,
+        pos_init=False,
+        m_attn=0.25,
+        m_mlp=1,
+        checkpoint_res=0,
+        checkpoint_attn=0,
+        checkpoint_mlp=0,
+        attn_order=0,
+        blocks=None,
+        spread=None,
+        x_cond=False,
+        y_cond=False,
+        encoder_dims=0,
+        only_encode=False,
+        merged_decoder=False,
+        prime_len=None,
+    ):
+        super().__init__()
+        self.input_shape = input_shape
+        self.input_dims = input_dims = np.prod(input_shape)
+        self.encoder_dims = encoder_dims
+        self.bins = bins
+        self.width = width
+        self.depth = depth
+
+        self.x_emb = nn.Embedding(bins, width)
+        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
+        self.x_emb_dropout = nn.Dropout(emb_dropout)
+        self.y_cond = y_cond
+        self.x_cond = x_cond
+        if not y_cond:
+            self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
+
+        self.pos_emb = PositionEmbedding(
+            input_shape=input_shape, width=width, init_scale=init_scale, pos_init=pos_init
+        )
+        self.pos_emb_dropout = nn.Dropout(emb_dropout)
+
+        self.transformer = JukeboxTransformer(
+            width=width,
+            n_ctx=input_dims,
+            n_head=heads,
+            n_depth=depth,
+            attn_dropout=attn_dropout,
+            resid_dropout=resid_dropout,
+            afn="relu",
+            scale=True,
+            mask=mask,
+            zero_out=zero_out,
+            init_scale=init_scale,
+            res_scale=res_scale,
+            m_attn=m_attn,
+            m_mlp=m_mlp,
+            checkpoint_attn=checkpoint_attn,
+            checkpoint_mlp=checkpoint_mlp,
+            checkpoint_res=checkpoint_res,
+            attn_order=attn_order,
+            blocks=blocks,
+            spread=spread,
+            encoder_dims=encoder_dims,
+            prime_len=prime_len,
+        )
+
+        self.only_encode = only_encode
+        self.prime_len = prime_len
+        if merged_decoder:
+            # Merged piped model uses this setup
+            self.add_cond_after_transformer = False
+            self.share_x_emb_x_out = False
+        else:
+            self.add_cond_after_transformer = True
+            self.share_x_emb_x_out = True
+
+        if not only_encode:
+            self.x_out = nn.Linear(width, bins, bias=False)
+            if self.share_x_emb_x_out:
+                self.x_out.weight = self.x_emb.weight
+            self.loss = torch.nn.CrossEntropyLoss()
+
+    def preprocess(self, x):
+        # Input: x is NHWC and uint8. Converted to NL and long
+        # Can include stuff like bitpacking, reordering here.
+        N = x.shape[0]
+        return x.view(N, -1).long()
+
+    def postprocess(self, x, sample_tokens=None):
+        # Convert back from NL and long to NHWC
+        N = x.shape[0]
+        assert (0 <= x).all() and (x < self.bins).all()
+        if sample_tokens is None or sample_tokens == self.input_dims:
+            return x.view(N, *self.input_shape)
+        else:
+            return x.view(N, -1)
+
+    def forward(
+        self,
+        x,
+        x_cond=None,
+        y_cond=None,
+        encoder_kv=None,
+        fp16=False,
+        loss_full=False,
+        encode=False,
+        get_preds=False,
+        get_acts=False,
+        get_sep_loss=False,
+    ):
+        # Preprocess.
+        with torch.no_grad():
+            x = self.preprocess(x)
+
+        N, D = x.shape
+        # assert isinstance(x, torch.cuda.LongTensor)
+        assert (0 <= x).all() and (x < self.bins).all()
+
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
+                N,
+                1,
+                self.width,
+            ), (
+                f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
+                " --sample_length?"
+            )
+        else:
+            assert x_cond is None
+            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+
+        x_t = x  # Target
+        x = self.x_emb(x)  # X emb
+        x = roll(x, 1)  # Shift by 1, and fill in start token
+        if self.y_cond:
+            x[:, 0] = y_cond.view(N, self.width)
+        else:
+            x[:, 0] = self.start_token
+
+        x = self.x_emb_dropout(x) + self.pos_emb_dropout(self.pos_emb()) + x_cond  # Pos emb and dropout
+
+        x = self.transformer(x, encoder_kv=encoder_kv, fp16=fp16)  # Transformer
+        if self.add_cond_after_transformer:  # Piped doesnt add x_cond
+            x = x + x_cond
+
+        acts = x
+        if self.only_encode:
+            return x
+        x = self.x_out(x)  # Predictions
+
+        if get_sep_loss:
+            assert self.prime_len is not None
+            x_prime = x[:, : self.prime_len].reshape(-1, self.bins)
+            x_gen = x[:, self.prime_len :].reshape(-1, self.bins)
+
+            prime_loss = F.cross_entropy(x_prime, x_t[:, : self.prime_len].reshape(-1)) / np.log(2.0)
+            gen_loss = F.cross_entropy(x_gen, x_t[:, self.prime_len :].reshape(-1)) / np.log(2.0)
+
+            loss = (prime_loss, gen_loss)  # Note order! Prime is first
+        else:
+            loss = F.cross_entropy(x.view(-1, self.bins), x_t.view(-1)) / np.log(2.0)  # Loss
+
+        if get_preds:
+            return loss, x
+        elif get_acts:
+            return loss, acts
+        else:
+            return loss, None
+
+    def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
+        N, D = n_samples, self.input_dims
+        if sample_t == 0:
+            # Fill in start token
+            # x = torch.empty(n_samples, 1, self.width).cuda()
+            x = torch.empty(n_samples, 1, self.width).to(x_cond.device)
+
+            if self.y_cond:
+                x[:, 0] = y_cond.view(N, self.width)
+            else:
+                x[:, 0] = self.start_token
+        else:
+            # assert isinstance(x, torch.cuda.LongTensor)
+            assert (0 <= x).all() and (x < self.bins).all()
+            x = self.x_emb(x)
+        assert x.shape == (n_samples, 1, self.width)
+        if x_cond.shape == (N, D, self.width):
+            cond = x_cond[:, sample_t : sample_t + 1, :]
+        else:
+            cond = x_cond
+        x = x + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
+        assert x.shape == (n_samples, 1, self.width)
+        return x, cond
+
+    def sample(
+        self,
+        n_samples,
+        x_cond=None,
+        y_cond=None,
+        encoder_kv=None,
+        fp16=False,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        get_preds=False,
+        sample_tokens=None,
+    ):
+        assert self.training is False
+
+        if sample_tokens is None:
+            sample_tokens = self.input_dims
+        N, D = n_samples, self.input_dims
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
+                N,
+                1,
+                self.width,
+            ), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
+        else:
+            assert x_cond is None
+            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
+                "cuda" if torch.cuda.is_available() else "cpu"
+            )
+
+        with torch.no_grad():
+            xs, x = [], None
+            if get_preds:
+                preds = []
+            # for sample_t in get_range(range(0, sample_tokens)):
+            for sample_t in range(0, sample_tokens):
+
+                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                self.transformer.check_cache(n_samples, sample_t, fp16)
+                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
+                if self.add_cond_after_transformer:
+                    x = x + cond
+                assert x.shape == (n_samples, 1, self.width)
+                x = self.x_out(x)  # Predictions
+                if get_preds:
+                    preds.append(x.clone())
+                # Adjust logits
+                x = x / temp
+                x = filter_logits(x, top_k=top_k, top_p=top_p)
+                x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
+                assert x.shape == (n_samples, 1)
+                xs.append(x.clone())
+
+            del x
+            self.transformer.del_cache()
+
+            x = torch.cat(xs, dim=1)
+            if get_preds:
+                preds = torch.cat(preds, dim=1)
+            x = self.postprocess(x, sample_tokens)
+        if get_preds:
+            return x, preds
+        else:
+            return x
+
+    def primed_sample(
+        self,
+        n_samples,
+        x,
+        x_cond=None,
+        y_cond=None,
+        encoder_kv=None,
+        fp16=False,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        get_preds=False,
+        chunk_size=None,
+        sample_tokens=None,
+    ):
+
+        if sample_tokens is None:
+            sample_tokens = self.input_dims
+        # Preprocess.
+        with torch.no_grad():
+            x = self.preprocess(x)
+        # assert isinstance(x, torch.cuda.LongTensor)
+        assert (0 <= x).all() and (x < self.bins).all()
+        assert x.shape[0] == n_samples
+        xs = torch.split(x, 1, dim=1)
+        xs = list(xs)
+        assert len(xs) < sample_tokens
+
+        N, D = n_samples, self.input_dims
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
+                N,
+                1,
+                self.width,
+            ), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
+        else:
+            assert x_cond is None
+            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(x.device)  # .cuda()
+
+        with torch.no_grad():
+            if get_preds:
+                preds = []
+
+            # Fill up key/value cache for past context by runing forward pass.
+            # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
+            if chunk_size is None:
+                chunk_size = len(xs)
+            # assert len(xs) % chunk_size == 0, f'expected {len(xs)} to be divisible by {chunk_size}'
+            chunk_sizes = split_chunks(len(xs), chunk_size)
+            x_primes = []
+            start = 0
+            x = None
+            # for current_chunk_size in get_range(chunk_sizes):
+            for current_chunk_size in chunk_sizes:
+
+                xs_prime, conds_prime = [], []
+                for sample_t in range(start, start + current_chunk_size):
+                    x_prime, cond_prime = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                    x = xs[sample_t]
+                    xs_prime.append(x_prime)
+                    conds_prime.append(cond_prime)
+                start = start + current_chunk_size
+
+                x_prime, cond_prime = torch.cat(xs_prime, dim=1), torch.cat(conds_prime, dim=1)
+                assert x_prime.shape == (n_samples, current_chunk_size, self.width)
+                assert cond_prime.shape == (n_samples, current_chunk_size, self.width)
+                del xs_prime
+                del conds_prime
+                if not get_preds:
+                    del cond_prime
+                x_prime = self.transformer(x_prime, encoder_kv=encoder_kv, sample=True, fp16=fp16)
+
+                if get_preds:
+                    if self.add_cond_after_transformer:
+                        x_prime = x_prime + cond_prime
+                    assert x_prime.shape == (n_samples, current_chunk_size, self.width)
+                    del cond_prime
+                    x_primes.append(x_prime)
+                else:
+                    del x_prime
+
+            if get_preds:
+                x_prime = torch.cat(x_primes, dim=1)
+                assert x_prime.shape == (n_samples, len(xs), self.width)
+                x_prime = self.x_out(x_prime)  # Predictions
+                preds.append(x_prime)
+
+            empty_cache()
+            self.transformer.check_cache(n_samples, len(xs), fp16)
+
+            x = xs[-1]
+            assert x.shape == (n_samples, 1)
+            empty_cache()
+            # for sample_t in get_range(range(len(xs), sample_tokens)):
+            for sample_t in range(len(xs), sample_tokens):
+
+                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                self.transformer.check_cache(n_samples, sample_t, fp16)
+                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
+                if self.add_cond_after_transformer:
+                    x = x + cond
+                assert x.shape == (n_samples, 1, self.width)
+                x = self.x_out(x)  # Predictions
+                if get_preds:
+                    preds.append(x)
+                # Adjust logits
+                x = x / temp
+                x = filter_logits(x, top_k=top_k, top_p=top_p)
+                x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
+                assert x.shape == (n_samples, 1)
+                xs.append(x.clone())
+
+            del x
+            self.transformer.del_cache()
+
+            x = torch.cat(xs, dim=1)
+            if get_preds:
+                preds = torch.cat(preds, dim=1)
+            x = self.postprocess(x, sample_tokens)
+        if get_preds:
+            return x, preds
+        else:
+            return x
+
+    def check_sample(self, chunk_size):
+        bs, l, d = (4, self.input_dims, self.width)
+        prime = int(self.input_dims // 8 * 7)
+        enc_l = self.encoder_dims
+        with torch.no_grad():
+            y_cond = torch.randn(bs, 1, d).cpu() if self.y_cond else None
+            x_cond = torch.randn(bs, l, d).cpu() if self.x_cond else None
+            encoder_kv = torch.randn(bs, enc_l, d).cpu()
+
+            x, preds_sample = self.sample(bs, x_cond, y_cond, encoder_kv, get_preds=True)
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = torch.max(torch.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+            )
+
+            x_prime = x.view(bs, -1)[:, :prime]
+            # unchunked
+            x, preds_sample = self.primed_sample(bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True)
+            assert (x.view(bs, -1)[:, :prime] == x_prime).all(), "Priming samples don't match"
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = torch.max(torch.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+            )
+
+            # chunked
+            x, preds_sample = self.primed_sample(
+                bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True, chunk_size=chunk_size
+            )
+            assert (x.view(bs, -1)[:, :prime] == x_prime).all(), "Priming samples don't match"
+            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
+            max_err = torch.max(torch.abs(preds_sample - preds_forw))
+            assert max_err <= 1e-6, (
+                "Max err is"
+                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
+            )
+
+
+def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
+    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+    Args:
+        logits: logits distribution shape (vocabulary size)
+        top_k >0: keep only top k tokens with highest probability (top-k filtering).
+        top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+    """
+    # assert logits.dim() == 2  # batch size 1 for now - could be updated for more but the code would be less clear
+    logits = logits.clone()
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    assert (top_k == 0) or (top_p == 0.0)
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        # indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        indices_to_remove = torch.zeros_like(logits, dtype=torch.uint8).scatter_(
+            dim=-1, index=sorted_indices, src=sorted_indices_to_remove
+        )
+        logits[indices_to_remove] = filter_value
+    return logits
+
+
+def get_normal(*shape, std=0.01):
+    w = torch.empty(shape)
+    nn.init.normal_(w, std=std)
+    return w
+
+
+def roll(x, n):
+    return torch.cat((x[:, -n:], x[:, :-n]), dim=1)
+
+
+def split_chunks(length, chunk_size):
+    n_passes = (length + chunk_size - 1) // chunk_size
+    chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
+    assert sum(chunk_sizes) == length
+    return chunk_sizes
+
+
+# Conditioners
+
+
+class MusicTokenConditioner(nn.Module):
+    """
+    The MusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE codebook) and
+    upsamples it using a single layer of decoder convolution block (the same is used in the VQ-VAE).
+
+    The tokens are passed through an embedding layer and the embeddings are upsampled.
+
+    """
+
+    def __init__(
+        self, input_shape, bins, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
+    ):
+        super().__init__()
+        self.x_shape = input_shape
+
+        # Embedding
+        self.width = out_width
+        self.x_emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
+
+        # MusicTokenConditioner, takes as input either uper level tokens or raw audio? #TODO check that
+        self.cond = DecoderConvBock(
+            self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
+        )
+        self.ln = LayerNorm(self.width)
+
+    def preprocess(self, x):
+        x = x.permute(0, 2, 1)  # NTC -> NCT
+        return x
+
+    def postprocess(self, x):
+        x = x.permute(0, 2, 1)  # NCT -> NTC
+        return x
+
+    def forward(self, x, x_cond=None):
+        # N = x.shape[0]
+        # assert_shape(x, (N, *self.x_shape))
+        if x_cond is not None:
+            # assert_shape(x_cond, (N, *self.x_shape, self.width))
+            pass
+        else:
+            x_cond = 0.0
+        # Embed x
+        x = x.long()
+        x = self.x_emb(x)
+        # assert_shape(x, (N, *self.x_shape, self.width))
+        x = x + x_cond
+
+        # Run conditioner
+        x = self.preprocess(x)
+        x = self.cond(x)
+        x = self.postprocess(x)
+        x = self.ln(x)
+        return x
+
+
+def flip(x):
+    def _flip(x):
+        return x.permute(0, 2, 1).contiguous()
+
+    if isinstance(x, (list, tuple)):
+        return [flip(z) for z in x]
+    return _flip(x)
+
+
+class SimpleEmbedding(nn.Module):
+    def __init__(self, bins, out_width, init_scale):
+        super().__init__()
+        self.bins = bins
+        self.emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
+
+    def forward(self, y):
+        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
+        # assert isinstance(y, torch.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}        assert (0 <= y).all() and (y < self.bins).all(), f"Bins {self.bins}, got label {y}"
+        return self.emb(y)
+
+
+class RangeEmbedding(nn.Module):
+    # Interpolating
+    # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
+    #
+    # Binning
+    # For each pos in position tensor, find its bin
+    # [start,end) mapped to [0,1,...,bins-1]
+    # [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
+    # NOTE: Open ended interval on right, so start <= pos < end, not <= end
+    def __init__(self, n_time, bins, range, out_width, init_scale, clamp=False):
+        super().__init__()
+        self.n_time = n_time
+        self.bins = bins
+        self.emb = nn.Embedding(bins, out_width)
+        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
+        self.pos_min, self.pos_max = range
+        self.clamp = clamp
+
+    def forward(self, pos_start, pos_end=None):
+        # Check if [pos_start,pos_end] in [pos_min, pos_max)
+        assert len(pos_start.shape) == 2, f"Expected shape with 2 dims, got {pos_start.shape}"
+        assert (self.pos_min <= pos_start).all() and (
+            pos_start < self.pos_max
+        ).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}"
+        pos_start = pos_start.float()
+        if pos_end is not None:
+            assert len(pos_end.shape) == 2, f"Expected shape with 2 dims, got {pos_end.shape}"
+            if self.clamp:
+                pos_end = pos_end.clamp(self.pos_min, self.pos_max)
+            assert (self.pos_min <= pos_end).all() and (
+                pos_end <= self.pos_max
+            ).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_end}"
+            pos_end = pos_end.float()
+        # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx
+        n_time = self.n_time
+        if n_time != 1:
+            assert pos_end is not None
+            interpolation = (
+                torch.arange(0, n_time, dtype=torch.float, device=pos_start.device).view(1, n_time) / n_time
+            )
+            position = pos_start + (pos_end - pos_start) * interpolation
+        else:
+            position = pos_start
+
+        # Bin each value to bins
+        normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)  # [0,1)
+        bins = (self.bins * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,bins) -> [0,1...,bins-1]
+        return self.emb(bins)
+
+
+class LabelConditioner(nn.Module):
+    def __init__(
+        self,
+        y_bins,
+        t_bins,
+        sr,
+        min_duration,
+        max_duration,
+        n_time,
+        out_width,
+        init_scale,
+        max_bow_genre_size,
+        include_time_signal,
+    ):
+        super().__init__()
+        self.n_time = n_time
+        self.out_width = out_width
+        assert len(y_bins) == 2, f"Expecting (genre, artist) bins, got {y_bins}"
+        bow_genre_bins, artist_bins = y_bins
+        self.max_bow_genre_size = max_bow_genre_size
+        self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
+        self.artist_emb = SimpleEmbedding(artist_bins, out_width, init_scale)
+        self.include_time_signal = include_time_signal
+        if self.include_time_signal:
+            t_ranges = (
+                (min_duration * sr, max_duration * sr),  # Total length
+                (0.0, max_duration * sr),  # Absolute pos
+                (0.0, 1.0),
+            )  # Relative pos
+            assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
+            total_length_range, absolute_pos_range, relative_pos_range = t_ranges
+            self.total_length_emb = RangeEmbedding(1, t_bins, total_length_range, out_width, init_scale)
+            self.absolute_pos_emb = RangeEmbedding(n_time, t_bins, absolute_pos_range, out_width, init_scale)
+            self.relative_pos_emb = RangeEmbedding(
+                n_time, t_bins, relative_pos_range, out_width, init_scale, clamp=True
+            )
+
+    def forward(self, y):
+        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
+        assert (
+            y.shape[-1] == 4 + self.max_bow_genre_size
+        ), f"Expected shape (N,{4 + self.max_bow_genre_size}), got {y.shape}"
+        # assert isinstance(y, torch.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}"
+        # N = y.shape[0]
+        total_length, offset, length, artist, genre = y[:, 0:1], y[:, 1:2], y[:, 2:3], y[:, 3:4], y[:, 4:]
+
+        # Start embedding of length 1
+        artist_emb = self.artist_emb(artist)
+        # Empty genre slots are denoted by -1. We mask these out.
+        mask = (genre >= 0).float().unsqueeze(2)
+        genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)
+        start_emb = genre_emb + artist_emb
+        # assert_shape(start_emb, (N, 1, self.out_width))
+
+        # Pos embedding of length n_ctx
+        if self.include_time_signal:
+            start, end = offset, offset + length
+            total_length, start, end = total_length.float(), start.float(), end.float()
+            pos_emb = (
+                self.total_length_emb(total_length)
+                + self.absolute_pos_emb(start, end)
+                + self.relative_pos_emb(start / total_length, end / total_length)
+            )
+            # assert_shape(pos_emb, (N, self.n_time, self.out_width))
+        else:
+            pos_emb = None
+        return start_emb, pos_emb
+
+
+class JukeboxPrior(nn.Module):
+    """
+    Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
+    on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
+    we use the MusicTokenConditioner class To condition on lyrics, we allow two types of priors:
+    - Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer
+      autoregressively
+    models the lyrics, and we use its last layer to produce keys/values that are attened to by the decoder transformer
+    - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
+      vocab
+    and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
+    which we autoregressively model together.
+    """
+
+    def __init__(self, config, level):
+        super().__init__()
+        vqvae_z_shapes = config.vqvae_z_shapes
+
+        def rescale(z_shape):
+            return (z_shape[0] * config.n_ctx[-level - 1] // vqvae_z_shapes[level][0],)
+
+        z_shapes = [rescale(z_shape) for z_shape in vqvae_z_shapes]
+        self.use_tokens = config.use_tokens[-level - 1]
+        self.n_tokens = config.n_tokens[-level - 1]
+        self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
+
+        self.copy_input = config.copy_input
+        if self.copy_input:
+            config.bins = config.l_bins
+
+        self.z_shapes = z_shapes
+        self.levels = len(self.z_shapes)
+
+        self.z_shape = self.z_shapes[level]
+
+        self.level = level
+        assert level < self.levels, f"Total levels {self.levels}, got level {level}"
+
+        self.l_bins = config.l_bins
+
+        prior_kwargs = dict(
+            input_shape=(config.n_ctx[-level - 1],),
+            bins=config.l_bins,
+            width=config.width[-level - 1],
+            depth=config.depth[-level - 1],
+            heads=config.n_heads[-level - 1],
+            attn_order=config.attn_order[-level - 1],
+            blocks=config.blocks,
+            spread=config.spread,
+            attn_dropout=config.attn_dropout,
+            resid_dropout=config.resid_dropout,
+            emb_dropout=config.emb_dropout,
+            zero_out=config.zero_out,
+            res_scale=config.res_scale,
+            pos_init=config.pos_init,
+            init_scale=config.init_scale[-level - 1],
+            m_attn=config.m_attn,  # m_mlp=config.m_mlp
+        )
+
+        if config.use_tokens and not config.single_enc_dec[-level - 1]:
+            prime_kwargs = dict(
+                bins=config.n_vocab,
+                width=config.prime_width[-level - 1],
+                depth=config.prime_depth[-level - 1],
+                heads=config.prime_heads,
+                attn_order=config.prime_attn_order[-level - 1],
+                blocks=config.prime_blocks,
+                spread=config.prime_spread,
+                attn_dropout=config.prime_attn_dropout,
+                resid_dropout=config.prime_resid_dropout,
+                emb_dropout=config.prime_emb_dropout,
+                zero_out=config.prime_zero_out,
+                res_scale=config.prime_res_scale,
+                pos_init=config.prime_pos_init,
+                init_scale=config.prime_init_scale[-level - 1],
+                m_attn=config.prime_m_attn,
+                m_mlp=config.prime_m_mlp,
+            )
+        else:
+            prime_kwargs = dict(bins=config.n_vocab)
+
+        x_cond_kwargs = dict(
+            out_width=config.width[-level - 1],
+            init_scale=config.init_scale[-level - 1],
+            width=config.cond_width[-level - 1],
+            depth=config.cond_depth[-level - 1],
+            m_conv=config.cond_m_conv,
+            dilation_growth_rate=config.cond_dilation_growth_rate[-level - 1],
+            dilation_cycle=config.cond_dilation_cycle[-level - 1],
+            zero_out=config.cond_zero_out,
+            res_scale=config.cond_res_scale,
+            checkpoint_res=config.cond_c_res[-level - 1],
+        )  # have to keep this else names wrong
+
+        y_cond_kwargs = dict(
+            out_width=config.width[-level - 1],
+            init_scale=config.init_scale[-level - 1],
+            y_bins=config.y_bins[-level - 1],
+            t_bins=config.t_bins,
+            sr=config.sr,
+            min_duration=config.min_duration,
+            max_duration=config.max_duration,
+            max_bow_genre_size=config.max_bow_genre_size,
+        )
+
+        # X conditioning
+        self.x_cond = level != (self.levels - 1)
+        self.cond_level = level + 1
+
+        # Y conditioning
+        self.y_cond = config.labels
+
+        self.single_enc_dec = config.single_enc_dec[-level - 1]
+        # X conditioning : conditioning on music tokens (either from audio or from previous levels )
+        if self.x_cond:
+            self.conditioner_blocks = nn.ModuleList()
+
+            def conditioner_block(_level):
+                return MusicTokenConditioner(
+                    input_shape=z_shapes[_level],
+                    bins=config.l_bins,
+                    down_t=config.downs_t[_level],
+                    stride_t=config.strides_t[_level],
+                    **x_cond_kwargs,
+                )
+
+            # if dist.get_rank() == 0: print(f"Conditioning on 1 above level(s)")
+            self.conditioner_blocks.append(conditioner_block(self.cond_level))
+
+        # Y conditioning : contioning on timing, genres, and artist
+        if self.y_cond:
+            self.n_time = self.z_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
+            self.y_emb = LabelConditioner(n_time=self.n_time, include_time_signal=not self.x_cond, **y_cond_kwargs)
+
+        # Lyric conditioning
+        if config.single_enc_dec[-level - 1]:
+            # Single encoder-decoder transformer
+            self.prior_shapes = [(self.n_tokens,), prior_kwargs.pop("input_shape")]
+            self.prior_bins = [prime_kwargs["bins"], prior_kwargs.pop("bins")]
+            self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
+            self.prior_bins_shift = np.cumsum([0, *self.prior_bins])[:-1]
+            self.prior_width = prior_kwargs["width"]
+            print(f"Creating cond. autoregress with prior bins {self.prior_bins}, ")
+            print(f"dims {self.prior_dims}, ")
+            print(f"shift {self.prior_bins_shift}")
+            print(f"input shape {sum(self.prior_dims)}")
+            print(f"input bins {sum(self.prior_bins)}")
+            print(f"Self copy is {self.copy_input}")
+
+            self.prime_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
+            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            self.prior = JukeboxConditionalAutoregressive(
+                input_shape=(sum(self.prior_dims),),
+                bins=sum(self.prior_bins),
+                x_cond=(self.x_cond or self.y_cond),
+                y_cond=True,
+                prime_len=self.prime_loss_dims,
+                **prior_kwargs,
+            )
+
+        else:
+            # Separate encoder-decoder transformer
+            if self.n_tokens != 0 and self.use_tokens:
+                prime_input_shape = (self.n_tokens,)
+                self.prime_loss_dims = np.prod(prime_input_shape)
+                self.prime_acts_width, self.prime_state_width = prime_kwargs["width"], prior_kwargs["width"]
+                self.prime_prior = JukeboxConditionalAutoregressive(
+                    input_shape=prime_input_shape, x_cond=False, y_cond=False, only_encode=True, **prime_kwargs
+                )
+                self.prime_state_proj = Conv1D(
+                    self.prime_acts_width, self.prime_state_width, init_scale=prime_kwargs["init_scale"]
+                )
+                self.prime_state_ln = LayerNorm(self.prime_state_width)
+                self.prime_bins = prime_kwargs["bins"]
+                self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)
+                nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs["init_scale"])
+            else:
+                self.prime_loss_dims = 0
+            self.gen_loss_dims = np.prod(self.z_shape)
+            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            self.prior = JukeboxConditionalAutoregressive(
+                x_cond=(self.x_cond or self.y_cond),
+                y_cond=self.y_cond,
+                encoder_dims=self.prime_loss_dims,
+                merged_decoder=config.merged_decoder[-level - 1],
+                **prior_kwargs,
+            )
+
+        self.n_ctx = self.gen_loss_dims
+        self.downsamples = calculate_strides(config.strides_t, config.downs_t)
+        self.cond_downsample = self.downsamples[level + 1] if level != self.levels - 1 else None
+        self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
+        self.sample_length = self.n_ctx * self.raw_to_tokens
+        # if the labels are used for training, the trainer will use it?
+        # This is probably were its gonna get a bit complicated
+
+        # if labels:
+        #     self.labels_v3 = labels_v3
+        #     self.labeller = Labeller(self.y_emb.max_bow_genre_size, self.n_tokens, self.sample_length, v3=self.labels_v3)
+        # else:
+        #     self.labeller = EmptyLabeller()
+
+        print(
+            f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
+            f" length:{self.sample_length}"
+        )
+
+    def get_y(self, labels, start, get_indices=False):
+        # labeler does not exist this should be removed
+        # if isinstance(self.labeller, EmptyLabeller):
+        #     return None
+
+        # y = labels["y"].clone()
+        y = labels.clone()
+
+        # Set sample_length to match this level
+        y[:, 2] = int(self.sample_length)
+
+        # Set offset
+        y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
+        if get_indices:
+            indices = None
+            return y, indices  # here the indices should be the indices of the lyrics to take into account...
+        return y
+        # Set lyric tokens
+        indices = self.labeller.set_y_lyric_tokens(y, labels)
+        if get_indices:
+            return y, indices
+        else:
+            return y
+
+    def get_z_conds(self, zs, start, end):
+        if self.level != self.levels - 1:
+            assert start % self.cond_downsample == end % self.cond_downsample == 0
+            z_cond = zs[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
+            assert z_cond.shape[1] == self.n_ctx // self.cond_downsample
+            z_conds = [z_cond]
+        else:
+            z_conds = None
+        return z_conds
+
+    def prior_preprocess(self, xs, conds):
+        N = xs[0].shape[0]
+        for i in range(len(xs)):
+            x, _, dims = xs[i], self.prior_shapes[i], self.prior_dims[i]
+            bins, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])
+            # assert isinstance(x, torch.cuda.LongTensor), x
+            assert (0 <= x).all() and (x < bins).all()
+            # assert_shape(x, (N, *shape))
+            xs[i] = (xs[i] + bins_shift).view(N, -1)
+
+        for i in range(len(conds)):
+            cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
+            if cond is not None:
+                # assert_shape(cond, (N, dims, self.prior_width))
+                pass
+            else:
+                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
+
+        return torch.cat(xs, dim=1), torch.cat(conds, dim=1)
+
+    def prior_postprocess(self, z):
+        N = z.shape[0]
+        dims = (self.prior_dims[0], z.shape[1] - self.prior_dims[0])
+        # xs = list(t.split(z, self.prior_dims, dim=1))
+        xs = list(torch.split(z, dims, dim=1))
+
+        for i in range(len(xs)):
+            # x, shape, dims, bins, bins_shift = xs[i], self.prior_shapes[i], self.prior_dims[i], self.prior_bins[i], self.prior_bins_shift[i]
+            # assert_shape(x, (N, dims))
+            shape = self.prior_shapes[i]
+            _, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])  # bins, -> _,
+            # xs[i] = (xs[i] - bins_shift).view(N, *shape) #view(N, -1, *shape[1:])
+            xs[i] = (xs[i] - bins_shift).view(N, -1, *shape[1:])
+            xs[i] = torch.clamp(
+                xs[i], min=0
+            )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
+            # assert (xs[i] < bins).all(), f'rank: {dist.get_rank()}, bins: {bins}, dims {dims}, shape {shape}, prior_shape {self.prior_shapes}, bins_shift {bins_shift}, xs[i]: {xs[i]}'
+
+        return xs[-1]
+
+    def x_emb(self, z_conds):
+        z_conds = z_conds[: self.cond_level - self.level]
+        assert (
+            len(z_conds) == len(self.conditioner_blocks) == self.cond_level - self.level
+        ), f"Expected {len(z_conds)} == {len(self.conditioner_blocks)} == {self.cond_level} - {self.level}"
+        x_cond = None
+        for z_cond, conditioner_block in reversed(list(zip(z_conds, self.conditioner_blocks))):
+            x_cond = conditioner_block(z_cond, x_cond)
+        return x_cond
+
+    # should be removed as the vq-vae is no longer part of the prior
+    def encode(self, x, start_level=None, end_level=None, bs_chunks=1):
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+        # Get latents
+        with torch.no_grad():
+            zs = self.encoder(x, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return zs
+
+    # same as above, the va-vae is no longer part of the prior
+    def decode(self, zs, start_level=None, end_level=None, bs_chunks=1):
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+
+        assert len(zs) == end_level - start_level
+        with torch.no_grad():
+            x_out = self.decoder(zs, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return x_out
+
+    def get_cond(self, z_conds, y):
+        if y is not None:
+            # assert (
+            #     y.shape[1] == 4 + self.y_emb.max_bow_genre_size + self.n_tokens
+            # ), f"Expected {4} + {self.y_emb.max_bow_genre_size} + {self.n_tokens}, got {y.shape[1]}"
+            # removed the labeler so there are no y_emb
+            n_labels = y.shape[1] - self.n_tokens
+            y, prime = y[:, :n_labels], y[:, n_labels:]
+        else:
+            y, prime = None, None
+        y_cond, y_pos = self.y_emb(y) if self.y_cond else (None, None)
+        x_cond = self.x_emb(z_conds) if self.x_cond else y_pos
+        return x_cond, y_cond, prime
+
+    def sample(
+        self,
+        n_samples,
+        z=None,
+        z_conds=None,
+        y=None,
+        fp16=False,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        chunk_size=None,
+        sample_tokens=None,
+    ):
+        N = n_samples
+        if z is not None:
+            assert z.shape[0] == N, f"Expected shape ({N},**), got shape {z.shape}"
+        if y is not None:
+            assert y.shape[0] == N, f"Expected shape ({N},**), got shape {y.shape}"
+        if z_conds is not None:
+            for z_cond in z_conds:
+                assert z_cond.shape[0] == N, f"Expected shape ({N},**), got shape {z_cond.shape}"
+
+        no_past_context = z is None or z.shape[1] == 0
+        # if dist.get_rank() == 0:
+        #     name = {True: 'Ancestral', False: 'Primed'}[no_past_context]
+        #     print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+        name = {True: "Ancestral", False: "Primed"}[no_past_context]
+        print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+
+        with torch.no_grad():
+            # Currently x_cond only uses immediately above layer
+            x_cond, y_cond, prime = self.get_cond(z_conds, y)
+            if self.single_enc_dec:
+                # assert chunk_size % self.prime_loss_dims == 0. TODO: Check if needed
+                if no_past_context:
+                    z, x_cond = self.prior_preprocess([prime], [None, x_cond])
+                else:
+                    z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+                if sample_tokens is not None:
+                    sample_tokens += self.n_tokens
+                z = self.prior.primed_sample(
+                    n_samples,
+                    z,
+                    x_cond,
+                    y_cond,
+                    fp16=fp16,
+                    temp=temp,
+                    top_k=top_k,
+                    top_p=top_p,
+                    chunk_size=chunk_size,
+                    sample_tokens=sample_tokens,
+                )
+                z = self.prior_postprocess(z)
+            else:
+                encoder_kv = self.get_encoder_kv(prime, fp16=fp16, sample=True)
+                if no_past_context:
+                    z = self.prior.sample(
+                        n_samples,
+                        x_cond,
+                        y_cond,
+                        encoder_kv,
+                        fp16=fp16,
+                        temp=temp,
+                        top_k=top_k,
+                        top_p=top_p,
+                        sample_tokens=sample_tokens,
+                    )
+                else:
+                    z = self.prior.primed_sample(
+                        n_samples,
+                        z,
+                        x_cond,
+                        y_cond,
+                        encoder_kv,
+                        fp16=fp16,
+                        temp=temp,
+                        top_k=top_k,
+                        top_p=top_p,
+                        chunk_size=chunk_size,
+                        sample_tokens=sample_tokens,
+                    )
+            if sample_tokens is None:
+                # assert_shape(z, (N, *self.z_shape))
+                pass
+        return z
+
+    def get_encoder_kv(self, prime, fp16=False, sample=False):
+        if self.n_tokens != 0 and self.use_tokens:
+            if sample:
+                self.prime_prior = self.prime_prior.to(prime.device)
+                # self.prime_prior.cuda()
+                pass
+            # N = prime.shape[0]
+            prime_acts = self.prime_prior(prime, None, None, None, fp16=fp16)
+            # assert_shape(prime_acts, (N, self.prime_loss_dims, self.prime_acts_width))
+            assert prime_acts.dtype == torch.float, f"Expected  torch.float, got {prime_acts.dtype}"
+            encoder_kv = self.prime_state_ln(self.prime_state_proj(prime_acts))
+            assert encoder_kv.dtype == torch.float, f"Expected  torch.float, got {encoder_kv.dtype}"
+            if sample:
+                self.prime_prior.cpu()
+                if fp16:
+                    encoder_kv = encoder_kv.half()
+        else:
+            encoder_kv = None
+        return encoder_kv
+
+    def get_prime_loss(self, encoder_kv, prime_t):
+        if self.use_tokens:
+            encoder_kv = encoder_kv.float()
+            encoder_kv = self.prime_x_out(encoder_kv)
+            prime_loss = nn.functional.cross_entropy(encoder_kv.view(-1, self.prime_bins), prime_t.view(-1)) / np.log(
+                2.0
+            )
+        else:
+            prime_loss = torch.tensor(0.0, device="cuda")
+        return prime_loss
+
+    def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn_weights=False):
+        """
+        Arguments:
+            get_attn_weights (bool or set): Makes forward prop dump
+                self-attention softmaxes to self.prior.transformer.ws. Either a set of layer indices indicating which
+                layers to store, or a boolean value indicating whether to dump all.
+        """
+        assert isinstance(get_attn_weights, (bool, set))
+        if get_attn_weights:
+            self.prior.transformer.set_record_attn(get_attn_weights)
+        x_cond, y_cond, prime = self.get_cond(z_conds, y)
+        if self.copy_input:
+            prime = z[:, : self.n_tokens]
+        if self.single_enc_dec:
+            z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+            (prime_loss, gen_loss), preds = self.prior(
+                z, x_cond, y_cond, fp16=fp16, get_sep_loss=True, get_preds=get_preds
+            )
+        else:
+            encoder_kv = self.get_encoder_kv(prime, fp16=fp16)
+            prime_loss = self.get_prime_loss(encoder_kv, prime)
+            gen_loss, preds = self.prior(z, x_cond, y_cond, encoder_kv, fp16=fp16, get_preds=get_preds)
+        loss = (self.prime_loss_fraction * prime_loss * self.prime_loss_dims / self.total_loss_dims) + (
+            gen_loss * self.gen_loss_dims / self.total_loss_dims
+        )
+        metrics = dict(
+            bpd=gen_loss.clone().detach(), prime_loss=prime_loss.clone().detach(), gen_loss=gen_loss.clone().detach()
+        )
+        if get_preds:
+            metrics["preds"] = preds.clone().detach()
+        if get_attn_weights:
+            ws = self.prior.transformer.ws
+            self.prior.transformer.set_record_attn(False)
+            return ws
+        else:
+            return loss, metrics
+
+    def forward(self, x, y=None, fp16=False, decode=False, get_preds=False):
+        bs = x.shape[0]
+        z, *z_conds = self.encode(x, bs_chunks=bs)
+        loss, metrics = self.z_forward(z=z, z_conds=z_conds, y=y, fp16=fp16, get_preds=get_preds)
+        if decode:
+            x_out = self.decode([z, *z_conds])
+        else:
+            x_out = None
+        return x_out, loss, metrics
+
+
+class JukeboxPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = JukeboxConfig
+    # load_tf_weights = load_tf_weights_in_jukebox
+    base_model_prefix = "transformer"
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        # Reinitialize selected weights subject to the Jukebox Paper Scheme:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if "c_proj" in name and "weight" in name:
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, JukeboxModel):
+            module.gradient_checkpointing = value
+
+
+JUKEBOX_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`JukeboxConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+def split_batch(obj, n_samples, split_size):
+    n_passes = (n_samples + split_size - 1) // split_size
+    if isinstance(obj, torch.Tensor):
+        return torch.split(obj, split_size, dim=0)
+    elif isinstance(obj, list):
+        return list(zip(*[torch.split(item, split_size, dim=0) for item in obj]))
+    elif obj is None:
+        return [None] * n_passes
+    else:
+        raise TypeError("Unknown input type")
+
+
+# Break total_length into hops/windows of size n_ctx separated by hop_length
+def get_starts(total_length, n_ctx, hop_length):
+    starts = []
+    for start in range(0, total_length - n_ctx + hop_length, hop_length):
+        if start + n_ctx >= total_length:
+            # Last hop could be smaller, we make it n_ctx to maximise context
+            start = total_length - n_ctx
+        starts.append(start)
+    return starts
+
+
+def save_wav(fname, aud, sr):
+    import soundfile
+
+    # clip before saving?
+    aud = torch.clamp(aud, -1, 1).cpu().numpy()
+    for i in list(range(aud.shape[0])):
+        soundfile.write(f"{fname}/item_{i}.wav", aud[i], samplerate=sr, format="wav")
+
+
+def get_alignment(x, zs, labels, prior, level, fp16, hps):
+    level = level - 1  # Top level used
+    n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
+    z = zs[level]
+    bs, total_length = z.shape[0], z.shape[1]
+    if total_length < n_ctx:
+        padding_length = n_ctx - total_length
+        z = torch.cat([z, torch.zeros(bs, n_ctx - total_length, dtype=z.dtype, device=z.device)], dim=1)
+        total_length = z.shape[1]
+    else:
+        padding_length = 0
+
+    hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
+    alignment_head, alignment_layer = hps.alignment_head[-level - 1], hps.alignment_layer[-level - 1]
+    attn_layers = set([alignment_layer])
+    alignment_hops = {}
+    indices_hops = {}
+    prior = prior.to(zs.device)
+    # prior.cuda()
+    empty_cache()
+    for start in get_starts(total_length, n_ctx, hop_length):
+        end = start + n_ctx
+
+        # set y offset, sample_length and lyrics tokens
+        y, indices_hop = prior.get_y(labels, start, get_indices=True)
+        # assert len(indices_hop) == bs
+        for indices in indices_hop:
+            assert len(indices) == n_tokens
+
+        z_bs = torch.chunk(z, bs, dim=0)
+        y_bs = torch.chunk(y, bs, dim=0)
+        w_hops = []
+        for z_i, y_i in zip(z_bs, y_bs):
+            w_hop = prior.z_forward(z_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
+            assert len(w_hop) == 1
+            w_hops.append(w_hop[0][:, alignment_head])
+            del w_hop
+        w = torch.cat(w_hops, dim=0)
+        del w_hops
+        # assert_shape(w, (bs, n_ctx, n_tokens))
+        alignment_hop = w.float().cpu().numpy()
+        # assert_shape(alignment_hop, (bs, n_ctx, n_tokens))
+        del w
+
+        # alignment_hop has shape (bs, n_ctx, n_tokens)
+        # indices_hop is a list of len=bs, each entry of len hps.n_tokens
+        indices_hops[start] = indices_hop
+        alignment_hops[start] = alignment_hop
+    prior.cpu()
+    empty_cache()
+
+    # Combine attn for each hop into attn for full range
+    # Use indices to place them into correct place for corresponding source tokens
+    alignments = []
+    for item in range(bs):
+        # Note each item has different length lyrics
+        full_tokens = labels["info"][item]["full_tokens"]
+        alignment = np.zeros((total_length, len(full_tokens) + 1))
+        for start in reversed(get_starts(total_length, n_ctx, hop_length)):
+            end = start + n_ctx
+            alignment_hop = alignment_hops[start][item]
+            indices = indices_hops[start][item]
+            assert len(indices) == n_tokens
+            assert alignment_hop.shape == (n_ctx, n_tokens)
+            alignment[start:end, indices] = alignment_hop
+        alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
+        alignments.append(alignment)
+    return alignments
+
+
+@add_start_docstrings(
+    "The bare JUKEBOX Model from which you can sample",
+    JUKEBOX_START_DOCSTRING,
+)
+class JukeboxModel(JukeboxPreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        self.vqvae = VQVAE(config)
+        config.vqvae_z_shapes = self.vqvae.z_shapes
+        self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
+    def sample_partial_window(self, zs, labels, sampling_kwargs, level, tokens_to_sample, hps):
+        prior = self.priors[level]
+        z = zs[level]
+        n_ctx = prior.n_ctx
+        current_tokens = z.shape[1]
+        if current_tokens < n_ctx - tokens_to_sample:
+            sampling_kwargs["sample_tokens"] = current_tokens + tokens_to_sample
+            start = 0
+        else:
+            sampling_kwargs["sample_tokens"] = n_ctx
+            start = current_tokens - n_ctx + tokens_to_sample
+
+        return self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+
+    # Sample a single window of length=n_ctx at position=start on level=level
+    def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
+        prior = self.priors[level]
+        n_samples = hps.n_samples
+        n_ctx = prior.n_ctx
+        end = start + n_ctx
+
+        # get z already sampled at current level
+        z = zs[level][:, start:end]
+
+        if "sample_tokens" in sampling_kwargs:
+            # Support sampling a window shorter than n_ctx
+            sample_tokens = sampling_kwargs["sample_tokens"]
+        else:
+            sample_tokens = end - start
+        conditioning_tokens, new_tokens = z.shape[1], sample_tokens - z.shape[1]
+
+        print(
+            f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on"
+            f" {conditioning_tokens} tokens"
+        )
+
+        if new_tokens <= 0:
+            # Nothing new to sample
+            return zs
+
+        # get z_conds from level above
+        z_conds = prior.get_z_conds(zs, start, end)
+        # if there are no levels above should return None!
+
+        # set y offset, sample_length and lyrics tokens
+        y = prior.get_y(labels, start)
+
+        empty_cache()
+        max_batch_size = 2
+        max_batch_size = sampling_kwargs["max_batch_size"]
+        del sampling_kwargs["max_batch_size"]
+
+        z_list = split_batch(z, n_samples, max_batch_size)
+        z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
+        y_list = split_batch(y, n_samples, max_batch_size)
+        z_samples = []
+        for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
+            z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
+            z_samples.append(z_samples_i)
+        z = torch.cat(z_samples, dim=0)
+
+        sampling_kwargs["max_batch_size"] = max_batch_size
+
+        # Update z with new sample
+        z_new = z[:, -new_tokens:]
+        zs[level] = torch.cat([zs[level], z_new], dim=1)
+        return zs
+
+    # Sample total_length tokens at level=level with hop_length=hop_length
+    def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_length, hps):
+        # print(f"Sampling level {level}")
+        print(f"Sampling level {level}")
+        prior = self.priors[level]
+        if total_length >= prior.n_ctx:
+            for start in get_starts(total_length, prior.n_ctx, hop_length):
+                zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+        else:
+            zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
+        return zs
+
+    # Sample multiple levels
+    def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
+        alignments = None
+        for level in reversed(sample_levels):
+            prior = self.priors[level]
+            prior = prior.to(zs[0].device)
+            empty_cache()
+
+            # Set correct total_length, hop_length, labels and sampling_kwargs for level
+            assert (
+                hps.sample_length % prior.raw_to_tokens == 0
+            ), f"Expected sample_length {hps.sample_length} to be multiple of {prior.raw_to_tokens}"
+            total_length = hps.sample_length // prior.raw_to_tokens
+            hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
+
+            # TODO either mask them or ddo better
+            if level != len(sample_levels) - 1:
+                labels_level = labels[level][0][: 4 + hps.max_bow_genre_size].unsqueeze(0)
+                zs = self.sample_level(zs, labels_level, sampling_kwargs[level], level, total_length, hop_length, hps)
+            else:
+                zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
+
+            prior.to(zs[0].device)
+            empty_cache()
+
+            # Decode sample
+            with torch.no_grad():
+                x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
+
+            # if dist.get_world_size() > 1:
+            #     logdir = f"{hps.name}_rank_{dist.get_rank()}/level_{level}"
+            # else:
+            #     logdir = f"{hps.name}/level_{level}"
+            logdir = f"{hps.name}/level_{level}"
+            if not os.path.exists(logdir):
+                os.makedirs(logdir)
+            torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
+            save_wav(logdir, x, hps.sr)
+            if (
+                alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
+            ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
+                # either use level which will be the given lovel or use the total nb of levels?
+                # alignments = get_alignment(x, zs, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
+                pass  # TODO this is a really dirty fix
+        return zs
+
+    # Generate ancestral samples given a list of artists and genres
+    def ancestral_sample(self, labels, sampling_kwargs, hps):
+        priors = self.priors
+        sample_levels = list(range(len(priors)))
+        zs = [torch.zeros(hps.n_samples, 0, dtype=torch.long, device=self.device) for _ in range(len(priors))]
+        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        return zs
+
+    # Continue ancestral sampling from previously saved codes
+    def continue_sample(self, zs, labels, sampling_kwargs, hps):
+        sample_levels = list(range(len(self.priors)))
+        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        return zs
+
+    # Upsample given already generated upper-level codes
+    def upsample(self, zs, labels, sampling_kwargs, hps):
+        sample_levels = list(range(len(self.priors) - 1))
+        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        return zs
+
+    # Prompt the model with raw audio input (dimension: NTC) and generate continuations
+    def primed_sample(self, x, labels, sampling_kwargs, hps):
+        sample_levels = list(range(len(self.priors)))
+        zs = self.priors[-1].encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
+        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        return zs
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
new file mode 100644
index 0000000000000..7b2185a04887a
--- /dev/null
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -0,0 +1,346 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI Jukebox."""
+
+
+import json
+import os
+from json.encoder import INFINITY
+from typing import Any, Dict, List, Optional, Tuple
+
+import regex as re
+from tokenizers import normalizers
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "jukebox": "https://huggingface.co/ArthurZ/jukebox/blob/main/vocab.json",
+    }
+}
+
+PRETRAINED_LYRIC_TOKENS_SIZES = {
+    "jukebox": 512,  # corresonds to the dummy-model ?
+}
+
+
+class JukeboxTokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
+        - Artists, unique ids are associated to each artist from the provided dictionary.
+        - Genres, unique ids are associated to each genre from the provided dictionary.
+        - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
+        vocabulary.
+
+    This tokenizer is straight forward and does not require trainingg. It should be able to process a different number of inputs:
+    as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.:
+
+    Depending on the number of genres on which the model should be conditioned (`n_genres`).
+    ```
+    >>> from transformers import JukeboxTokenizer
+    >>> tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+    >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
+    [[6785],[546], [0, 0, 0, 0, 0, 0, 0,   41,       38,       30,
+                    77,       46,       41,       49,       40,
+                    77,       44,       41,       27,       30] ]
+    >>> tokenizer("Alan Jackson", "Country Rock")['input_ids']
+    [6785],[546]]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    If nothing is provided, the genres and the artist will either be selected randomly or set to None
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to:
+    this superclass for more information regarding those methods.
+
+    # TODO: the original paper should support composing from 2 or more artists and genres.
+    However the code does not allow that and only supports composing from various genres.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
+            'lyrics' and the values are their corresponding vocabulary files.
+        max_n_lyric_tokens (`int`, `optional`, defaults to 512):
+            Maximum number of lyric tokens to keep.
+        n_genres (`int`, `optional`, defaults to 1):
+            Maximum number of genres to use for composition.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_lyric_input_size = PRETRAINED_LYRIC_TOKENS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, vocab_file, max_n_lyric_tokens=512, n_genres=5, unk_token="<|endoftext|>", **kwargs):
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        super().__init__(
+            unk_token=unk_token,
+            **kwargs,
+        )
+        self.max_n_lyric_tokens = max_n_lyric_tokens
+        self.n_genres = n_genres
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            vocabulary = json.load(vocab_handle)
+            self.artists_encoder = vocabulary["artists"]
+            self.genres_encoder = vocabulary["genres"]
+            self.lyrics_encoder = vocabulary["lyrics"]
+
+        self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")  # FIXME: should be an argument?
+
+        self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
+        self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
+        self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
+
+    @property
+    def vocab_size(self):
+        return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
+
+    def get_vocab(self):
+        return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
+
+    def get_relevant_lyric_tokens(self, full_tokens, total_length, offset, duration):
+        """Extract only the relevant tokens based on the character position. A total of
+        `max_n_lyric_tokens` tokens will be returned. If the provided token sequence is smaller, it will be padded,
+        othewise, only characters ranging from the midpoint - `max_n_lyric_tokens//2` to the midpoint +
+        `max_n_lyric_tokens//2` will be returned. This *focuses* on the most relevant tokens (in time) for the
+        sequence.
+
+        Args: # TODO : args to prettify
+            full_tokens (`_type_`):
+                _description_
+            total_length (`_type_`):
+                _description_
+            offset (`_type_`):
+                _description_
+            duration (`_type_`):
+                _description_
+        """
+        if len(full_tokens) < self.max_n_lyric_tokens:
+            tokens = [0] * (self.max_n_lyric_tokens - len(full_tokens)) + full_tokens
+            indices = [-1] * (self.max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
+        else:
+            assert 0 <= offset < total_length
+            midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+            midpoint = min(
+                max(midpoint, self.max_n_lyric_tokens // 2), len(full_tokens) - self.max_n_lyric_tokens // 2
+            )
+            tokens = full_tokens[midpoint - self.max_n_lyric_tokens // 2 : midpoint + self.max_n_lyric_tokens // 2]
+            indices = list(range(midpoint - self.max_n_lyric_tokens // 2, midpoint + self.max_n_lyric_tokens // 2))
+        assert len(tokens) == self.max_n_lyric_tokens, f"Expected length {self.max_n_lyric_tokens}, got {len(tokens)}"
+        assert (
+            len(indices) == self.max_n_lyric_tokens
+        ), f"Expected length {self.max_n_lyric_tokens}, got {len(indices)}"
+        assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
+        return tokens
+
+    def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, duration):
+        """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
+        The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
+        the lyrics token sequence.
+
+        Args:
+            artist (`_type_`):
+                _description_
+            genre (`_type_`):
+                _description_
+            lyrics (`_type_`):
+                _description_
+            total_length (`_type_`):
+                _description_
+            offset (`_type_`):
+                _description_
+            duration (`_type_`):
+                _description_
+        """
+        artists_id = self.artists_encoder.get(artist)
+        genres_ids = [self.genres_encoder.get(genre) for genre in genres]
+        lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
+        lyric_ids = self.get_relevant_lyric_tokens(lyric_ids, total_length, offset, duration)
+        return artists_id, genres_ids, lyric_ids
+
+    def _tokenize(self, lyrics):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+        Do NOT take care of added tokens. Only the lytrics are split into character for the character-based vocabulary.
+        """
+        # only lyrics are not tokenized, but character based is easily handled
+        return [character for character in lyrics]
+
+    def tokenize(self, artist, genre, lyrics, **kwargs):
+        """
+        Converts three strings in a 3 sequence of tokens using the tokenizer
+
+        Args:
+            artist (`_type_`):
+                _description_
+            genre (`_type_`):
+                _description_
+            lyrics (`_type_`):
+                _description_
+        """
+        artist, genre, lyrics, kwargs = self.prepare_for_tokenization(artist, genre, lyrics, **kwargs)
+        # TODO deal with the kwargs here
+        lyrics = self._tokenize(lyrics)
+        return artist, genre, lyrics
+
+    def prepare_for_tokenization(
+        self, artist: str, genres: str, lyrics: str, is_split_into_words: bool = False, **kwargs
+    ) -> Tuple[str, str, str, Dict[str, Any]]:
+        """
+        Performs any necessary transformations before tokenization.
+
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
+
+        Args:
+            artist (`str`):
+                The artist name to prepare. This will mostly lower the string
+            genres (`str`):
+                The gnere name to prepare. This will mostly lower the string.
+            lyrics (`str`):
+                The lyrics to prepare.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize. This is useful for NER or token classification.
+            kwargs:
+                Keyword arguments to use for the tokenization. #TODO v3 could be handled here
+
+        Returns:
+            `Tuple[str, str, str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+        """
+        artist = self._normalize(artist)
+        genres = self._normalize(genres).split("_")  # split is for the full dictionnary with combined genres
+
+        lyrics = normalizers.BertNormalizer().normalize_str(lyrics)
+        lyrics = lyrics.replace("\\", "\n")
+        lyrics = self.out_of_vocab.sub("", lyrics)
+        return artist, genres, lyrics, kwargs
+
+    def _normalize(self, text: str) -> str:
+        """Normalizes the input text. This process is for the genres and the artit
+
+        Args:
+            text (`str`):
+                Artist or Genre string to normalize
+        """
+        import re
+
+        accepted = frozenset(
+            [chr(i) for i in range(ord("a"), ord("z") + 1)]
+            + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+            + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+        )
+
+        rex = re.compile(r"_+")
+        text = "".join([c if c in accepted else "_" for c in text.lower()])
+        text = rex.sub("_", text).strip("_")
+        return text
+
+    def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
+        """Converts an index (integer) in a token (str) using the vocab.
+        Args:
+            artists_index (`_type_`):
+                _description_
+            genres_index (`_type_`):
+                _description_
+            lyric_index (`_type_`):
+                _description_
+        """
+        artist = self.artists_decoder.get(artists_index)
+        genres = [self.genres_decoder.get(genre) for genre in genres_index]
+        lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
+        return artist, genres, lyrics
+
+    def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
+        return " ".join(lyrics)
+
+    # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
+    # a type argument to add an artist token with self.getattr('artist') ?
+    # TODO : is a call function required ?
+
+    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset, duration):
+        """Convert the raw string to token ids
+
+        Args:
+            artist (`_type_`):
+                _description_
+            genre (`_type_`):
+                _description_
+            lyrics (`_type_`):
+                _description_
+            total_length (`_type_`):
+                _description_
+            sample_length (`_type_`):
+                _description_
+            offset (`_type_`):
+                _description_
+            duration (`_type_`):
+                _description_
+        """
+        input_ids = [total_length, offset, sample_length]
+        artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
+        artists_id, genres_ids, lyric_ids = self._convert_token_to_id(
+            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, duration
+        )
+        input_ids += [artists_id] + genres_ids + lyric_ids
+        attention_masks = [-INFINITY] * (self.max_n_lyric_tokens - len(lyrics_tokens)) + [0] * len(lyrics_tokens)
+        return {"input_ids": input_ids, "attention_masks": attention_masks}
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Saves the tokenizer's vocabulary dictionnary to the provided save_directory.
+
+        Args:
+            save_directory (`str`):
+                _description_
+            filename_prefix (`Optional[str]`, *optional*, defaults to None):
+                _description_
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(
+                json.dumps(
+                    {"artists": self.artists_encoder, "genres": self.genres_encoder, "lyrics": self.lyrics_encoder},
+                    ensure_ascii=False,
+                )
+            )
+
+        return (vocab_file,)
diff --git a/src/transformers/models/jukebox/tokenization_jukebox_fast.py b/src/transformers/models/jukebox/tokenization_jukebox_fast.py
new file mode 100644
index 0000000000000..f3948e0ef2ef8
--- /dev/null
+++ b/src/transformers/models/jukebox/tokenization_jukebox_fast.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+
+import json
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+from tokenizers import pre_tokenizers
+
+from ...tokenization_utils_base import BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_jukebox import JukeboxTokenizer
+
+
+if TYPE_CHECKING:
+    from transformers.pipelines.conversational import Conversation
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "jukebox": "https://huggingface.co/jukebox/resolve/main/vocab.json",
+    },
+    "tokenizer_file": {"jukebox": "https://huggingface.co/jukebox/resolve/main/tokenizer.json"},
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"jukebox": 1024}
+
+
+class JukeboxTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" Jukebox tokenizer, backed by HuggingFace's tokenizers library. Jukebox can be conditioned on 3
+    different inputs :
+        - Artists, unique ids are associated to each artist from the provided dictionary.
+        - Genres, unique ids are associated to each genre from the provided dictionary.
+        - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
+        vocabulary.
+
+    This tokenizer is straight forward and does not require trainingg. It should be able to process a different number
+    of inputs: as the conditioning of the model can be done on the three different queries. If None is provided,
+    defaults values will be used.:
+
+    ```
+    >>> from transformers import JukeboxTokenizer
+    >>> tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+    >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
+    [15496, 995]
+    >>> tokenizer("Alan Jackson", "Country Rock")['input_ids']
+    [15496, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    If nothing is provided, the genres and the artist will either be selected randomly or set to None
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
+    to: this superclass for more information regarding those methods.
+
+    Args:
+        artitst_vocab_file (`str`):
+            Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
+            'lyrics' and the values are their corresponding vocabulary files.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = JukeboxTokenizer
+
+    def __init__(
+        self, vocab_file=None, tokenizer_file=None, unk_token="<|endoftext|>", add_prefix_space=False, **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+        # TODO: should it be using WordLevel tokenizer ? Don't really know how that works yet
+        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
+        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
+            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
+            pre_tok_state["add_prefix_space"] = add_prefix_space
+            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
+
+        self.add_prefix_space = add_prefix_space
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+        """This corresponds to DialoGPT variants of models."""
+        input_ids = []
+        for is_user, text in conversation.iter_texts():
+            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
+
+        if len(input_ids) > self.model_max_length:
+            input_ids = input_ids[-self.model_max_length :]
+        return input_ids
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 847e7d87abee4..e1d968dcecca3 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2377,6 +2377,27 @@ def load_tf_weights_in_imagegpt(*args, **kwargs):
     requires_backends(load_tf_weights_in_imagegpt, ["torch"])
 
 
+JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class JukeboxModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class JukeboxPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+def load_tf_weights_in_jukebox(*args, **kwargs):
+    requires_backends(load_tf_weights_in_jukebox, ["torch"])
+
+
 LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 00989dc0d12a4..286f0833420c9 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -66,6 +66,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece"])
 
 
+class JukeboxTokenizer(metaclass=DummyObject):
+    _backends = ["sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["sentencepiece"])
+
+
 class LayoutXLMTokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 
diff --git a/tests/models/jukebox/__init__.py b/tests/models/jukebox/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
new file mode 100644
index 0000000000000..8e688731aaf85
--- /dev/null
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -0,0 +1,1346 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import timeit
+import unittest
+
+import numpy as np
+
+from transformers import JukeboxConfig, is_torch_available
+from transformers.trainer_utils import set_seed
+
+
+# from datasets import load_dataset
+
+
+# from transformers.testing_utils import require_torch, slow, torch_device
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import JukeboxModel, JukeboxTokenizer  # ,JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+class JukeboxModelTest(unittest.TestCase):
+    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
+
+    # @slow
+    def test_model(self):
+        set_seed(0)
+
+        config = JukeboxConfig(
+            n_ctx=(256, 256, 256),
+            width=[128, 64, 32],
+            depth=[2, 2, 2],
+            priors_width=[128, 64, 32],
+            cond_width=[128, 128, 64],
+            l_bins=128,
+            vq_vae_codebook_dimension=128,
+            vq_vae_emmbedding_width=128,
+            sr=44100,
+            attn_order=[12, 2, 2],
+            n_heads=[2, 1, 1],
+            t_bins=64,
+            single_enc_dec=[True, False, False],
+            labels=True,
+            n_vocab=79,
+            sample_length=44032
+            # allows the use of label conditionning. Has to be
+            # True if the single_enc_dec is set to true apparently
+            # ntokens also have to be set to the nb of lyric tokens
+        )
+
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy").eval()
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
+
+        # Checks
+
+        import random
+
+        seed = 0
+        random.seed(seed)
+        np.random.seed(seed)
+
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+        sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
+
+        expected_samples = torch.Tensor(
+            [
+                [
+                    121,
+                    67,
+                    16,
+                    111,
+                    54,
+                    84,
+                    0,
+                    0,
+                    41,
+                    0,
+                    14,
+                    0,
+                    0,
+                    49,
+                    20,
+                    12,
+                    5,
+                    0,
+                    58,
+                    83,
+                    0,
+                    61,
+                    0,
+                    29,
+                    0,
+                    36,
+                    42,
+                    62,
+                    75,
+                    0,
+                    88,
+                    51,
+                    0,
+                    0,
+                    20,
+                    110,
+                    39,
+                    20,
+                    85,
+                    0,
+                    0,
+                    0,
+                    76,
+                    0,
+                    32,
+                    17,
+                    99,
+                    0,
+                    127,
+                    103,
+                    78,
+                    0,
+                    0,
+                    125,
+                    82,
+                    0,
+                    38,
+                    74,
+                    0,
+                    41,
+                    38,
+                    0,
+                    0,
+                    127,
+                    45,
+                    0,
+                    2,
+                    99,
+                    0,
+                    88,
+                    84,
+                    86,
+                    5,
+                    70,
+                    0,
+                    0,
+                    0,
+                    0,
+                    23,
+                    0,
+                    0,
+                    5,
+                    0,
+                    0,
+                    3,
+                    28,
+                    47,
+                    1,
+                    32,
+                    0,
+                    9,
+                    98,
+                    111,
+                    0,
+                    66,
+                    0,
+                    0,
+                    0,
+                    59,
+                    48,
+                    0,
+                    123,
+                    61,
+                    37,
+                    13,
+                    121,
+                    24,
+                    122,
+                    101,
+                    0,
+                    68,
+                    13,
+                    31,
+                    0,
+                    57,
+                    0,
+                    24,
+                    13,
+                    85,
+                    0,
+                    0,
+                    68,
+                    0,
+                    105,
+                    0,
+                    105,
+                    0,
+                    50,
+                    0,
+                    0,
+                    64,
+                    0,
+                    14,
+                    103,
+                    0,
+                    0,
+                    0,
+                    77,
+                    26,
+                    33,
+                    0,
+                    79,
+                    55,
+                    57,
+                    0,
+                    37,
+                    0,
+                    0,
+                    79,
+                    53,
+                    0,
+                    111,
+                    83,
+                    58,
+                    41,
+                    70,
+                    1,
+                    28,
+                    109,
+                    56,
+                    0,
+                    98,
+                    80,
+                    0,
+                    100,
+                    62,
+                    126,
+                    0,
+                    0,
+                    23,
+                    0,
+                    0,
+                    43,
+                    114,
+                    23,
+                    44,
+                    0,
+                    68,
+                    53,
+                    0,
+                    0,
+                    84,
+                    0,
+                    0,
+                    0,
+                    4,
+                    123,
+                    0,
+                    0,
+                    99,
+                    36,
+                    78,
+                    0,
+                    0,
+                    45,
+                    16,
+                    75,
+                    111,
+                    95,
+                    62,
+                    36,
+                    0,
+                    52,
+                    92,
+                    33,
+                    71,
+                    3,
+                    0,
+                    110,
+                    0,
+                    0,
+                    0,
+                    124,
+                    0,
+                    0,
+                    0,
+                    2,
+                    0,
+                    101,
+                    125,
+                    0,
+                    0,
+                    0,
+                    3,
+                    0,
+                    0,
+                    123,
+                    0,
+                    0,
+                    85,
+                    0,
+                    99,
+                    0,
+                    36,
+                    107,
+                    77,
+                    0,
+                    4,
+                    41,
+                    73,
+                    0,
+                    66,
+                    43,
+                    19,
+                    0,
+                    0,
+                    124,
+                    0,
+                    55,
+                    32,
+                    0,
+                    0,
+                    0,
+                    0,
+                    90,
+                    96,
+                ]
+            ]
+        )
+
+        self.assertTrue(np.allclose(sample, expected_samples))
+
+        with torch.no_grad():
+            x = model.vqvae.decode([sample], start_level=1, end_level=2, bs_chunks=sample.shape[0])
+
+        expected_x = torch.Tensor(
+            [
+                0.0595,
+                0.0952,
+                0.0354,
+                0.1182,
+                0.0312,
+                0.1063,
+                0.0306,
+                0.1336,
+                0.0369,
+                0.0902,
+                0.0332,
+                0.1230,
+                0.0322,
+                0.1036,
+                0.0332,
+                0.1352,
+                0.0382,
+                0.0941,
+                0.0302,
+                0.1226,
+                0.0313,
+                0.1077,
+                0.0316,
+                0.1375,
+                0.0392,
+                0.0961,
+                0.0303,
+                0.1233,
+                0.0342,
+                0.1067,
+                0.0334,
+                0.1359,
+                0.0404,
+                0.0963,
+                0.0309,
+                0.1218,
+                0.0319,
+                0.1069,
+                0.0323,
+                0.1373,
+                0.0398,
+                0.0952,
+                0.0310,
+                0.1237,
+                0.0348,
+                0.1058,
+                0.0336,
+                0.1370,
+                0.0410,
+                0.0954,
+                0.0306,
+                0.1224,
+                0.0331,
+                0.1081,
+                0.0323,
+                0.1365,
+                0.0410,
+                0.0982,
+                0.0331,
+                0.1223,
+                0.0368,
+                0.1070,
+                0.0338,
+                0.1359,
+                0.0416,
+                0.0976,
+                0.0328,
+                0.1214,
+                0.0346,
+                0.1087,
+                0.0328,
+                0.1364,
+                0.0393,
+                0.0973,
+                0.0333,
+                0.1236,
+                0.0361,
+                0.1074,
+                0.0337,
+                0.1361,
+                0.0409,
+                0.0967,
+                0.0322,
+                0.1222,
+                0.0342,
+                0.1090,
+                0.0320,
+                0.1374,
+                0.0398,
+                0.0985,
+                0.0331,
+                0.1231,
+                0.0362,
+                0.1074,
+                0.0335,
+                0.1360,
+                0.0410,
+                0.0971,
+                0.0325,
+                0.1220,
+            ]
+        )
+
+        first_100 = x.squeeze(-1)[0][0:100]
+        self.assertTrue(torch.allclose(first_100, expected_x, atol=1e-4))
+
+        sampling_temperature = 0.98
+        lower_batch_size = 16
+        max_batch_size = 16
+        lower_level_chunk_size = 32
+        chunk_size = 32
+        sampling_kwargs = [
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
+        ]
+        config.hop_fraction = [0.125, 0.5, 0.5]
+        config.n_samples = 1
+
+        tokens = tokenizer(
+            "Alan Jackson",
+            "rock",
+            "old town road",
+            total_length=config.sample_length_in_seconds * config.sr,
+            sample_length=32768,
+            offset=0,
+            duration=1,
+        )
+
+        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+
+        ys = np.array([[inputs]] * 3, dtype=np.int64)
+        ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).to("cpu").long()
+
+        start = timeit.default_timer()
+        zs = model.ancestral_sample(ys, sampling_kwargs, config)
+        print(f"time to sample : {timeit.default_timer() - start}")
+        print(zs)
+        top_50_expected_zs = torch.Tensor(
+            [
+                33,
+                90,
+                94,
+                17,
+                88,
+                88,
+                31,
+                65,
+                127,
+                112,
+                26,
+                58,
+                107,
+                5,
+                89,
+                53,
+                80,
+                48,
+                98,
+                68,
+                1,
+                33,
+                80,
+                80,
+                126,
+                2,
+                53,
+                8,
+                16,
+                45,
+                35,
+                64,
+                75,
+                10,
+                16,
+                11,
+                65,
+                39,
+                85,
+                17,
+                112,
+                44,
+                68,
+                63,
+                16,
+                127,
+                35,
+                90,
+                51,
+                27,
+            ]
+        )
+
+        self.assertTrue(torch.allclose(zs[0][0][0:50], top_50_expected_zs.long(), atol=1e-4))
+
+    def test_gpu_sampling(self):
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics-local").eval()  # .to("cuda")
+
+        # model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 386 * [0]]).long().to("cuda"), chunk_size=32)
+
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
+
+        sampling_temperature = 0.98
+        lower_batch_size = 16
+        max_batch_size = 16
+        lower_level_chunk_size = 32
+        chunk_size = 32
+        sampling_kwargs = [
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
+        ]
+
+        model.config.sr = 44100
+        model.config.hop_fraction = [0.125, 0.5, 0.5]
+        model.config.n_samples = 1
+        model.config.sample_length = 2 * model.config.sr  # 32768
+
+        model.config.sample_length_in_seconds = 2
+        model.config.total_sample_length_in_seconds = 180
+
+        metas = dict(
+            artist="Zac Brown Band",
+            genres="Country",
+            total_length=model.config.total_sample_length_in_seconds * model.config.sr,
+            offset=0,
+            lyrics="""I met a traveller from an antique land,
+            Who said—“Two vast and trunkless legs of stone
+            Stand in the desert. . . . Near them, on the sand,
+            Half sunk a shattered visage lies, whose frown,
+            And wrinkled lip, and sneer of cold command,
+            Tell that its sculptor well those passions read
+            Which yet survive, stamped on these lifeless things,
+            The hand that mocked them, and the heart that fed;
+            And on the pedestal, these words appear:
+            My name is Ozymandias, King of Kings;
+            Look on my Works, ye Mighty, and despair!
+            Nothing beside remains. Round the decay
+            Of that colossal Wreck, boundless and bare
+            The lone and level sands stretch far away
+            """,
+            duration=2,
+            sample_length=2 * model.config.sr,
+        )
+
+        # tokens = tokenizer(
+        #     "Alan Jackson",
+        #     "rock",
+        #     "old town road",
+        #     total_length=model.config.total_sample_length_in_seconds * model.config.sr,
+        #     sample_length=2*model.config.sr,#32768, # 256 tokens from level 0, as row_to_tokens is 128
+        #     offset=0,
+        #     duration=2,
+        # )
+
+        tokens = tokenizer(**metas)
+
+        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+        ys = np.array([[inputs]] * 3, dtype=np.int64)
+        ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).long()  # .to("cuda")
+
+        start = timeit.default_timer()
+        # import cProfile as profile
+        # profile.runctx('model.ancestral_sample(ys, sampling_kwargs, config)', globals(), locals())
+        zs = model.ancestral_sample(ys, sampling_kwargs, model.config)
+        print(f"time to sample : {timeit.default_timer() - start}")
+
+
+if __name__ == "__main__":
+    tester = JukeboxModelTest()
+    tester.test_gpu_sampling()
+
+# class JukeboxModelTester:
+#     def __init__(
+#         self,
+#         parent,
+#         batch_size=14,
+#         seq_length=7,
+#         is_training=True,
+#         use_token_type_ids=True,
+#         use_input_mask=True,
+#         use_labels=True,
+#         use_mc_token_ids=True,
+#         vocab_size=99,
+#         hidden_size=32,
+#         num_hidden_layers=5,
+#         num_attention_heads=4,
+#         intermediate_size=37,
+#         hidden_act="gelu",
+#         hidden_dropout_prob=0.1,
+#         attention_probs_dropout_prob=0.1,
+#         max_position_embeddings=512,
+#         type_vocab_size=16,
+#         type_sequence_label_size=2,
+#         initializer_range=0.02,
+#         num_labels=3,
+#         num_choices=4,
+#         scope=None,
+#     ):
+#         self.parent = parent
+#         self.batch_size = batch_size
+#         self.seq_length = seq_length
+#         self.is_training = is_training
+#         self.use_token_type_ids = use_token_type_ids
+#         self.use_input_mask = use_input_mask
+#         self.use_labels = use_labels
+#         self.use_mc_token_ids = use_mc_token_ids
+#         self.vocab_size = vocab_size
+#         self.hidden_size = hidden_size
+#         self.num_hidden_layers = num_hidden_layers
+#         self.num_attention_heads = num_attention_heads
+#         self.intermediate_size = intermediate_size
+#         self.hidden_act = hidden_act
+#         self.hidden_dropout_prob = hidden_dropout_prob
+#         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+#         self.max_position_embeddings = max_position_embeddings
+#         self.type_vocab_size = type_vocab_size
+#         self.type_sequence_label_size = type_sequence_label_size
+#         self.initializer_range = initializer_range
+#         self.num_labels = num_labels
+#         self.num_choices = num_choices
+#         self.scope = None
+#         self.bos_token_id = vocab_size - 1
+#         self.eos_token_id = vocab_size - 1
+#         self.pad_token_id = vocab_size - 1
+
+#     def get_large_model_config(self):
+#         return JukeboxConfig.from_pretrained("jukebox")
+
+#     def prepare_config_and_inputs(
+#         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
+#     ):
+#         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+#         input_mask = None
+#         if self.use_input_mask:
+#             input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+#         token_type_ids = None
+#         if self.use_token_type_ids:
+#             token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+#         mc_token_ids = None
+#         if self.use_mc_token_ids:
+#             mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
+
+#         sequence_labels = None
+#         token_labels = None
+#         choice_labels = None
+#         if self.use_labels:
+#             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+#             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+#             choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+#         config = self.get_config(
+#             gradient_checkpointing=gradient_checkpointing,
+#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
+#             reorder_and_upcast_attn=reorder_and_upcast_attn,
+#         )
+
+#         head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
+
+#         return (
+#             config,
+#             input_ids,
+#             input_mask,
+#             head_mask,
+#             token_type_ids,
+#             mc_token_ids,
+#             sequence_labels,
+#             token_labels,
+#             choice_labels,
+#         )
+
+#     def get_config(
+#         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
+#     ):
+#         return JukeboxConfig(
+#             vocab_size=self.vocab_size,
+#             n_embd=self.hidden_size,
+#             n_layer=self.num_hidden_layers,
+#             n_head=self.num_attention_heads,
+#             n_inner=self.intermediate_size,
+#             activation_function=self.hidden_act,
+#             resid_pdrop=self.hidden_dropout_prob,
+#             attn_pdrop=self.attention_probs_dropout_prob,
+#             n_positions=self.max_position_embeddings,
+#             type_vocab_size=self.type_vocab_size,
+#             initializer_range=self.initializer_range,
+#             use_cache=True,
+#             bos_token_id=self.bos_token_id,
+#             eos_token_id=self.eos_token_id,
+#             pad_token_id=self.pad_token_id,
+#             gradient_checkpointing=gradient_checkpointing,
+#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
+#             reorder_and_upcast_attn=reorder_and_upcast_attn,
+#         )
+
+#     def prepare_config_and_inputs_for_decoder(self):
+#         (
+#             config,
+#             input_ids,
+#             input_mask,
+#             head_mask,
+#             token_type_ids,
+#             mc_token_ids,
+#             sequence_labels,
+#             token_labels,
+#             choice_labels,
+#         ) = self.prepare_config_and_inputs()
+
+#         encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+#         encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+#         return (
+#             config,
+#             input_ids,
+#             input_mask,
+#             head_mask,
+#             token_type_ids,
+#             sequence_labels,
+#             token_labels,
+#             choice_labels,
+#             encoder_hidden_states,
+#             encoder_attention_mask,
+#         )
+
+#     def create_and_check_jukebox_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+#         model = JukeboxModel(config=config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+#         result = model(input_ids, token_type_ids=token_type_ids)
+#         result = model(input_ids)
+
+#         self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+#         self.parent.assertEqual(len(result.past_key_values), config.n_layer)
+
+#     def create_and_check_jukebox_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+#         model = JukeboxModel(config=config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         # first forward pass
+#         outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
+#         outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
+#         outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
+
+#         self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
+#         self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
+
+#         output, past = outputs.to_tuple()
+
+#         # create hypothetical next token and extent to next_input_ids
+#         next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+#         next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
+
+#         # append to next input_ids and token_type_ids
+#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+#         next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
+
+#         output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
+#         output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
+#             "last_hidden_state"
+#         ]
+
+#         # select random slice
+#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+#         output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+#         output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+#         # test that outputs are equal for slice
+#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+#     def create_and_check_jukebox_model_attention_mask_past(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+#     ):
+#         model = JukeboxModel(config=config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         # create attention mask
+#         attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
+#         half_seq_length = self.seq_length // 2
+#         attn_mask[:, half_seq_length:] = 0
+
+#         # first forward pass
+#         output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
+
+#         # create hypothetical next token and extent to next_input_ids
+#         next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
+
+#         # change a random masked slice from input_ids
+#         random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
+#         random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
+#         input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
+
+#         # append to next input_ids and attn_mask
+#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+#         attn_mask = torch.cat(
+#             [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
+#             dim=1,
+#         )
+
+#         # get two different outputs
+#         output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
+#         output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
+
+#         # select random slice
+#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+#         output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
+#         output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
+
+#         # test that outputs are equal for slice
+#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+#     def create_and_check_jukebox_model_past_large_inputs(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args
+#     ):
+#         model = JukeboxModel(config=config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         # first forward pass
+#         outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True)
+
+#         output, past = outputs.to_tuple()
+
+#         # create hypothetical next token and extent to next_input_ids
+#         next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+#         next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)
+#         next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+#         # append to next input_ids and token_type_ids
+#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+#         next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
+#         next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+#         output_from_no_past = model(
+#             next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
+#         )["last_hidden_state"]
+#         output_from_past = model(
+#             next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past
+#         )["last_hidden_state"]
+#         self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
+
+#         # select random slice
+#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+#         output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+#         output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+#         # test that outputs are equal for slice
+#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+#     def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
+#         model = JukeboxLMHeadModel(config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+#         self.parent.assertEqual(result.loss.shape, ())
+#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+#     def create_and_check_forward_and_backwards(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False
+#     ):
+#         model = JukeboxLMHeadModel(config)
+#         model.to(torch_device)
+#         if gradient_checkpointing:
+#             model.gradient_checkpointing_enable()
+
+#         result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+#         self.parent.assertEqual(result.loss.shape, ())
+#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+#         result.loss.backward()
+
+#     def create_and_check_double_lm_head_model(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
+#     ):
+#         model = JukeboxDoubleHeadsModel(config)
+#         model.to(torch_device)
+#         model.eval()
+
+#         multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+#         multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+#         multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+
+#         inputs = {
+#             "input_ids": multiple_choice_inputs_ids,
+#             "mc_token_ids": mc_token_ids,
+#             "attention_mask": multiple_choice_input_mask,
+#             "token_type_ids": multiple_choice_token_type_ids,
+#             "labels": multiple_choice_inputs_ids,
+#         }
+
+#         result = model(**inputs)
+#         self.parent.assertEqual(result.loss.shape, ())
+#         self.parent.assertEqual(
+#             result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
+#         )
+#         self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
+
+#     def create_and_check_jukebox_for_sequence_classification(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args
+#     ):
+#         config.num_labels = self.num_labels
+#         model = JukeboxForSequenceClassification(config)
+#         model.to(torch_device)
+#         model.eval()
+#         result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+#     def create_and_check_jukebox_for_token_classification(
+#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args
+#     ):
+#         config.num_labels = self.num_labels
+#         model = JukeboxForTokenClassification(config)
+#         model.to(torch_device)
+#         model.eval()
+#         result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
+
+#     def create_and_check_jukebox_weight_initialization(self, config, *args):
+#         model = JukeboxModel(config)
+#         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
+#         for key in model.state_dict().keys():
+#             if "c_proj" in key and "weight" in key:
+#                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
+#                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
+
+#     def prepare_config_and_inputs_for_common(self):
+#         config_and_inputs = self.prepare_config_and_inputs()
+
+#         (
+#             config,
+#             input_ids,
+#             input_mask,
+#             head_mask,
+#             token_type_ids,
+#             mc_token_ids,
+#             sequence_labels,
+#             token_labels,
+#             choice_labels,
+#         ) = config_and_inputs
+
+#         inputs_dict = {
+#             "input_ids": input_ids,
+#             "token_type_ids": token_type_ids,
+#             "head_mask": head_mask,
+#         }
+
+#         return config, inputs_dict
+
+
+# @require_torch
+# class JukeboxModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+
+#     all_model_classes = (
+#         (
+#             JukeboxModel,
+#             JukeboxLMHeadModel,
+#             JukeboxDoubleHeadsModel,
+#             JukeboxForSequenceClassification,
+#             JukeboxForTokenClassification,
+#         )
+#         if is_torch_available()
+#         else ()
+#     )
+#     all_generative_model_classes = (JukeboxLMHeadModel, JukeboxDoubleHeadsModel) if is_torch_available() else ()
+#     all_parallelizable_model_classes = (JukeboxLMHeadModel, JukeboxDoubleHeadsModel) if is_torch_available() else ()
+#     fx_compatible = False
+#     test_missing_keys = False
+#     test_model_parallel = True
+
+#     # special case for DoubleHeads model
+#     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+#         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+#         if return_labels:
+#             if model_class.__name__ == "JukeboxDoubleHeadsModel":
+#                 inputs_dict["labels"] = torch.zeros(
+#                     (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length),
+#                     dtype=torch.long,
+#                     device=torch_device,
+#                 )
+#                 inputs_dict["input_ids"] = inputs_dict["labels"]
+#                 inputs_dict["token_type_ids"] = inputs_dict["labels"]
+#                 inputs_dict["mc_token_ids"] = torch.zeros(
+#                     (self.model_tester.batch_size, self.model_tester.num_choices),
+#                     dtype=torch.long,
+#                     device=torch_device,
+#                 )
+#                 inputs_dict["mc_labels"] = torch.zeros(
+#                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
+#                 )
+#         return inputs_dict
+
+#     def setUp(self):
+#         self.model_tester = JukeboxModelTester(self)
+#         self.config_tester = ConfigTester(self, config_class=JukeboxConfig, n_embd=37)
+
+#     def test_config(self):
+#         self.config_tester.run_common_tests()
+
+#     def test_jukebox_model(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_model(*config_and_inputs)
+
+#     def test_jukebox_model_past(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_model_past(*config_and_inputs)
+
+#     def test_jukebox_model_att_mask_past(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_model_attention_mask_past(*config_and_inputs)
+
+#     def test_jukebox_model_past_large_inputs(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_model_past_large_inputs(*config_and_inputs)
+
+#     def test_jukebox_lm_head_model(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
+
+#     def test_jukebox_double_lm_head_model(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
+
+#     def test_jukebox_sequence_classification_model(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_for_sequence_classification(*config_and_inputs)
+
+#     def test_jukebox_token_classification_model(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_for_token_classification(*config_and_inputs)
+
+#     def test_jukebox_gradient_checkpointing(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True)
+
+#     def test_jukebox_scale_attn_by_inverse_layer_idx(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True)
+#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs)
+
+#     def test_jukebox_reorder_and_upcast_attn(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True)
+#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs)
+
+#     def test_jukebox_weight_initialization(self):
+#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
+#         self.model_tester.create_and_check_jukebox_weight_initialization(*config_and_inputs)
+
+#     @slow
+#     def test_batch_generation(self):
+#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
+#         model.to(torch_device)
+#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+
+#         tokenizer.padding_side = "left"
+
+#         # Define PAD Token = EOS Token = 50256
+#         tokenizer.pad_token = tokenizer.eos_token
+#         model.config.pad_token_id = model.config.eos_token_id
+
+#         # use different length sentences to test batching
+#         sentences = [
+#             "Hello, my dog is a little",
+#             "Today, I",
+#         ]
+
+#         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+#         input_ids = inputs["input_ids"].to(torch_device)
+#         token_type_ids = torch.cat(
+#             [
+#                 input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
+#                 input_ids.new_full((input_ids.shape[0], 1), 500),
+#             ],
+#             dim=-1,
+#         )
+
+#         outputs = model.generate(
+#             input_ids=input_ids,
+#             attention_mask=inputs["attention_mask"].to(torch_device),
+#         )
+
+#         outputs_tt = model.generate(
+#             input_ids=input_ids,
+#             attention_mask=inputs["attention_mask"].to(torch_device),
+#             token_type_ids=token_type_ids,
+#         )
+
+#         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
+#         output_non_padded = model.generate(input_ids=inputs_non_padded)
+
+#         num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
+#         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
+#         output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
+
+#         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+#         batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
+#         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
+#         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
+
+#         expected_output_sentence = [
+#             "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
+#             "Today, I'm going to be doing a lot of research on this. I",
+#         ]
+#         self.assertListEqual(expected_output_sentence, batch_out_sentence)
+#         self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
+#         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
+
+#     @slow
+#     def test_batch_generation_2heads(self):
+#         model = JukeboxDoubleHeadsModel.from_pretrained("jukebox")
+#         model.to(torch_device)
+#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+
+#         tokenizer.padding_side = "left"
+
+#         # This tokenizer has no pad token, so we have to set it in some way
+#         # Define PAD Token = EOS Token = 50256
+#         tokenizer.pad_token = tokenizer.eos_token
+#         model.config.pad_token_id = model.config.eos_token_id
+
+#         # use different length sentences to test batching
+#         sentences = [
+#             "Hello, my dog is a little",
+#             "Today, I",
+#         ]
+
+#         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
+#         input_ids = inputs["input_ids"].to(torch_device)
+#         token_type_ids = torch.cat(
+#             [
+#                 input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
+#                 input_ids.new_full((input_ids.shape[0], 1), 500),
+#             ],
+#             dim=-1,
+#         )
+
+#         outputs = model.generate(
+#             input_ids=input_ids,
+#             attention_mask=inputs["attention_mask"].to(torch_device),
+#         )
+
+#         outputs_tt = model.generate(
+#             input_ids=input_ids,
+#             attention_mask=inputs["attention_mask"].to(torch_device),
+#             token_type_ids=token_type_ids,
+#         )
+
+#         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
+#         output_non_padded = model.generate(input_ids=inputs_non_padded)
+
+#         num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
+#         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
+#         output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
+
+#         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+#         batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
+#         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
+#         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
+
+#         expected_output_sentence = [
+#             "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
+#             "Today, I'm going to be doing a lot of research on this. I",
+#         ]
+#         self.assertListEqual(expected_output_sentence, batch_out_sentence)
+#         self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
+#         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
+
+#     @slow
+#     def test_model_from_pretrained(self):
+#         for model_name in JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+#             model = JukeboxModel.from_pretrained(model_name)
+#             self.assertIsNotNone(model)
+
+
+# @require_torch
+# class JukeboxModelLanguageGenerationTest(unittest.TestCase):
+#     def _test_lm_generate_jukebox_helper(
+#         self,
+#         gradient_checkpointing=False,
+#         reorder_and_upcast_attn=False,
+#         scale_attn_by_inverse_layer_idx=False,
+#         verify_outputs=True,
+#     ):
+#         model = JukeboxLMHeadModel.from_pretrained(
+#             "jukebox",
+#             reorder_and_upcast_attn=reorder_and_upcast_attn,
+#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
+#         )
+#         if gradient_checkpointing:
+#             model.gradient_checkpointing_enable()
+#         else:
+#             model.gradient_checkpointing_disable()
+#         model.to(torch_device)
+
+#         # The dog
+#         input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device)
+
+#         # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
+#         # fmt: off
+#         expected_output_ids = [
+#             464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,
+#         ]
+#         # fmt: on
+#         output_ids = model.generate(input_ids, do_sample=False)
+#         if verify_outputs:
+#             self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
+
+#     @slow
+#     def test_lm_generate_jukebox(self):
+#         self._test_lm_generate_jukebox_helper()
+
+#     @slow
+#     def test_lm_generate_jukebox_with_gradient_checkpointing(self):
+#         self._test_lm_generate_jukebox_helper(gradient_checkpointing=True)
+
+#     @slow
+#     def test_lm_generate_jukebox_with_reorder_and_upcast_attn(self):
+#         self._test_lm_generate_jukebox_helper(reorder_and_upcast_attn=True)
+
+#     @slow
+#     def test_lm_generate_jukebox_with_scale_attn_by_inverse_layer_idx(self):
+#         self._test_lm_generate_jukebox_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False)
+
+#     @slow
+#     def test_jukebox_sample(self):
+#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
+#         model.to(torch_device)
+
+#         torch.manual_seed(0)
+#         tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True)
+#         input_ids = tokenized.input_ids.to(torch_device)
+#         output_ids = model.generate(input_ids, do_sample=True)
+#         output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+#         token_type_ids = tokenized.token_type_ids.to(torch_device)
+#         output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5)
+#         output_seq_tt = model.generate(
+#             input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5
+#         )
+#         output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True)
+#         output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True)
+
+#         EXPECTED_OUTPUT_STR = (
+#             "Today is a nice day and if you don't know anything about the state of play during your holiday"
+#         )
+#         self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
+#         self.assertTrue(
+#             all([output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))])
+#         )  # token_type_ids should change output
+
+#     @slow
+#     def test_jukebox_sample_max_time(self):
+#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
+#         model.to(torch_device)
+
+#         torch.manual_seed(0)
+#         tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True)
+#         input_ids = tokenized.input_ids.to(torch_device)
+
+#         MAX_TIME = 0.5
+
+#         start = datetime.datetime.now()
+#         model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256)
+#         duration = datetime.datetime.now() - start
+#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
+#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
+
+#         start = datetime.datetime.now()
+#         model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256)
+#         duration = datetime.datetime.now() - start
+#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
+#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
+
+#         start = datetime.datetime.now()
+#         model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256)
+#         duration = datetime.datetime.now() - start
+#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
+#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
+
+#         start = datetime.datetime.now()
+#         model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256)
+#         duration = datetime.datetime.now() - start
+#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
+#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
+
+#         start = datetime.datetime.now()
+#         model.generate(input_ids, do_sample=False, max_time=None, max_length=256)
+#         duration = datetime.datetime.now() - start
+#         self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
new file mode 100644
index 0000000000000..d26ef85fc816d
--- /dev/null
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -0,0 +1,211 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# import json
+# import os
+# import unittest
+
+# from transformers import JukeboxTokenizer, JukeboxTokenizerFast
+# from transformers.models.jukebox.tokenization_jukebox import VOCAB_FILES_NAMES
+# from transformers.testing_utils import require_tokenizers
+
+import unittest
+
+# from ..test_tokenization_common import TokenizerTesterMixin
+from transformers import JukeboxTokenizer
+
+
+class JukeBoxIntegrationTest(unittest.TestCase):
+
+    # @slow
+    def test_tokenizer(self):
+        """
+        how to run the same test with openAI
+        ...
+        """
+
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
+        tokenizer.max_n_lyric_tokens = 20
+        tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0, 1)
+        inputs, attention_masks = tokens["input_ids"], tokens["attention_masks"]
+        EXPECTED_OUTPUT = [
+            10584000,
+            0,
+            1048576,
+            145,
+            8,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            0,
+            41,
+            38,
+            30,
+            77,
+            46,
+            41,
+            49,
+            40,
+            77,
+            44,
+            41,
+            27,
+            30,
+        ]
+
+        self.assertTrue(inputs == EXPECTED_OUTPUT)
+        EXPECTED_MASK_OUTPUT = [-float("inf")] * 7 + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.assertTrue(attention_masks == EXPECTED_MASK_OUTPUT)
+
+
+# @require_tokenizers
+# class JukeboxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+
+#     tokenizer_class = JukeboxTokenizer
+#     rust_tokenizer_class = JukeboxTokenizerFast
+#     test_rust_tokenizer = True
+#     from_pretrained_kwargs = {"add_prefix_space": True}
+#     test_seq2seq = False
+
+#     def setUp(self):
+#         super().setUp()
+
+#         vocab = {
+#             "artist": {"Marron 5": 0, "Bob Marley": 1},
+#             "genres": {"Pop": 0, "Rap": 1},
+#             "lyrics": {
+#                 c: i
+#                 for c, i in enumerate(
+#                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-'\"()[] \t\n"
+#                 )
+#             },
+#         }
+#         self.special_tokens_map = {"unk_token": "<unk>"}
+
+#         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+#         with open(self.vocab_file, "w", encoding="utf-8") as fp:
+#             fp.write(json.dumps(vocab) + "\n")
+
+#     def get_tokenizer(self, **kwargs):
+#         kwargs.update(self.special_tokens_map)
+#         return JukeboxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+#     def get_rust_tokenizer(self, **kwargs):
+#         kwargs.update(self.special_tokens_map)
+#         return JukeboxTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+#     def get_input_output_texts(self, tokenizer):
+#         input_text = "lower newer"
+#         output_text = "lower newer"
+#         return input_text, output_text
+
+#     # TODO: mostly modify this part
+#     def test_full_tokenizer(self):
+#         tokenizer = JukeboxTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+#         text = "lower newer"
+#         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
+#         tokens = tokenizer.tokenize(text, add_prefix_space=True)
+#         self.assertListEqual(tokens, bpe_tokens)
+
+#         input_tokens = tokens + [tokenizer.unk_token]
+#         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+#         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+#     def test_rust_and_python_full_tokenizers(self):
+#         if not self.test_rust_tokenizer:
+#             return
+
+#         tokenizer = self.get_tokenizer()
+#         rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
+
+#         sequence = "lower newer"
+
+#         # Testing tokenization
+#         tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
+#         rust_tokens = rust_tokenizer.tokenize(sequence)
+#         self.assertListEqual(tokens, rust_tokens)
+
+#         # Testing conversion to ids without special tokens
+#         ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
+#         rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+#         self.assertListEqual(ids, rust_ids)
+
+#         # Testing conversion to ids with special tokens
+#         rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
+#         ids = tokenizer.encode(sequence, add_prefix_space=True)
+#         rust_ids = rust_tokenizer.encode(sequence)
+#         self.assertListEqual(ids, rust_ids)
+
+#         # Testing the unknown token
+#         input_tokens = tokens + [rust_tokenizer.unk_token]
+#         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
+#         self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+#     def test_pretokenized_inputs(self, *args, **kwargs):
+#         # It's very difficult to mix/test pretokenization with byte-level
+#         # And get both Jukebox and Roberta to work at the same time (mostly an issue of adding a space before the string)
+#         pass
+
+#     def test_padding(self, max_length=15):
+#         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+#             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+#                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+#                 # Simple input
+#                 s = "This is a simple input"
+#                 s2 = ["This is a simple input 1", "This is a simple input 2"]
+#                 p = ("This is a simple input", "This is a pair")
+#                 p2 = [
+#                     ("This is a simple input 1", "This is a simple input 2"),
+#                     ("This is a simple pair 1", "This is a simple pair 2"),
+#                 ]
+
+#                 # Simple input tests
+#                 self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
+
+#                 # Simple input
+#                 self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
+
+#                 # Simple input
+#                 self.assertRaises(
+#                     ValueError,
+#                     tokenizer_r.batch_encode_plus,
+#                     s2,
+#                     max_length=max_length,
+#                     padding="max_length",
+#                 )
+
+#                 # Pair input
+#                 self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
+
+#                 # Pair input
+#                 self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
+
+#                 # Pair input
+#                 self.assertRaises(
+#                     ValueError,
+#                     tokenizer_r.batch_encode_plus,
+#                     p2,
+#                     max_length=max_length,
+#                     padding="max_length",
+#                 )
+
+#     # tokenizer has no padding token
+#     def test_padding_different_model_input_name(self):
+#         pass

From d3162116d2f53764ba23de60aa6a4c763ede0656 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Thu, 23 Jun 2022 10:45:26 +0200
Subject: [PATCH 002/196] delete tf related function

---
 src/transformers/__init__.py                                  | 2 --
 src/transformers/models/jukebox/__init__.py                   | 2 --
 .../convert_jukebox_original_tf_checkpoint_to_pytorch.py      | 4 ++--
 src/transformers/models/jukebox/modeling_jukebox.py           | 1 -
 src/transformers/utils/dummy_pt_objects.py                    | 3 ---
 5 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index cda1befad22fa..e4a77e1e001d8 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1225,7 +1225,6 @@
             "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
             "JukeboxModel",
             "JukeboxPreTrainedModel",
-            "load_tf_weights_in_jukebox",
         ]
     )
     _import_structure["models.layoutlm"].extend(
@@ -3710,7 +3709,6 @@
             JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
             JukeboxModel,
             JukeboxPreTrainedModel,
-            load_tf_weights_in_jukebox,
         )
         from .models.layoutlm import (
             LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 4075d104e72c4..31a6fe650e712 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -36,7 +36,6 @@
         "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
         "JukeboxModel",
         "JukeboxPreTrainedModel",
-        "load_tf_weights_in_jukebox",
     ]
 
 if TYPE_CHECKING:
@@ -53,7 +52,6 @@
             JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
             JukeboxModel,
             JukeboxPreTrainedModel,
-            load_tf_weights_in_jukebox,
         )
 
 else:
diff --git a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
index 8b31282f253be..5f19a3949fa94 100644
--- a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
@@ -19,7 +19,7 @@
 
 import torch
 
-from transformers import JukeboxConfig, JukeboxModel, load_tf_weights_in_jukebox
+from transformers import JukeboxConfig, JukeboxModel
 from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
@@ -35,7 +35,7 @@ def convert_jukebox_checkpoint_to_pytorch(jukebox_checkpoint_path, jukebox_confi
     model = JukeboxModel(config)
 
     # Load weights from numpy
-    load_tf_weights_in_jukebox(model, config, jukebox_checkpoint_path)
+    # load_tf_weights_in_jukebox(model, config, jukebox_checkpoint_path)
 
     # Save pytorch-model
     pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ba402da1a253f..d01698cb4eb98 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3036,7 +3036,6 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     """
 
     config_class = JukeboxConfig
-    # load_tf_weights = load_tf_weights_in_jukebox
     base_model_prefix = "transformer"
     is_parallelizable = True
     supports_gradient_checkpointing = True
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index e1d968dcecca3..d11db7e960947 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2394,9 +2394,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-def load_tf_weights_in_jukebox(*args, **kwargs):
-    requires_backends(load_tf_weights_in_jukebox, ["torch"])
-
 
 LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
 

From 0263507921c0a6ced6b485e7c1af2cc644679be2 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Thu, 23 Jun 2022 10:48:02 +0200
Subject: [PATCH 003/196] fix copies

---
 README.md                                   | 1 +
 README_ko.md                                | 1 +
 README_zh-hans.md                           | 1 +
 README_zh-hant.md                           | 1 +
 docs/source/en/index.mdx                    | 1 +
 src/transformers/__init__.py                | 6 +-----
 src/transformers/models/jukebox/__init__.py | 6 +-----
 src/transformers/utils/dummy_pt_objects.py  | 1 -
 8 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index b03cc35753c7c..1be495f1210c7 100644
--- a/README.md
+++ b/README.md
@@ -277,6 +277,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_ko.md b/README_ko.md
index f977b1ecc3da0..3b84b46fec0fb 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -258,6 +258,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 4605918eb2eac..ce0348bad633c 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -282,6 +282,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index b4db503bc83ba..0cba1073ffdfb 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -294,6 +294,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index cd5ffd9f52f5f..ec470dd01861f 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -100,6 +100,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
 1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index e4a77e1e001d8..8a8fa3f00122e 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -3705,11 +3705,7 @@
             ImageGPTPreTrainedModel,
             load_tf_weights_in_imagegpt,
         )
-        from .models.jukebox import (
-            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
-            JukeboxModel,
-            JukeboxPreTrainedModel,
-        )
+        from .models.jukebox import JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST, JukeboxModel, JukeboxPreTrainedModel
         from .models.layoutlm import (
             LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
             LayoutLMForMaskedLM,
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 31a6fe650e712..483feabc0d50b 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -48,11 +48,7 @@
     except OptionalDependencyNotAvailable:
         pass
     else:
-        from .modeling_jukebox import (
-            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
-            JukeboxModel,
-            JukeboxPreTrainedModel,
-        )
+        from .modeling_jukebox import JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST, JukeboxModel, JukeboxPreTrainedModel
 
 else:
     import sys
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index d11db7e960947..8627713f80f97 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2394,7 +2394,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-
 LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 

From 4f6790900a2fad1fcdcefa719ae515c27e9db9b1 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Thu, 23 Jun 2022 14:51:53 +0200
Subject: [PATCH 004/196] update

---
 src/transformers/__init__.py                  |  1 -
 src/transformers/models/jukebox/__init__.py   |  2 +-
 .../models/jukebox/configuration_jukebox.py   | 34 ++-----------------
 .../models/jukebox/convert_jukebox.py         |  1 -
 ...kebox_original_tf_checkpoint_to_pytorch.py |  6 ++--
 .../models/jukebox/modeling_jukebox.py        |  9 ++---
 .../utils/dummy_sentencepiece_objects.py      |  7 ----
 tests/models/jukebox/test_modeling_jukebox.py |  4 ++-
 8 files changed, 14 insertions(+), 50 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 8a8fa3f00122e..79c6b6e727aeb 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -3074,7 +3074,6 @@
         from .models.cpm import CpmTokenizer
         from .models.deberta_v2 import DebertaV2Tokenizer
         from .models.fnet import FNetTokenizer
-        from .models.jukebox import JukeboxTokenizer
         from .models.layoutxlm import LayoutXLMTokenizer
         from .models.m2m_100 import M2M100Tokenizer
         from .models.marian import MarianTokenizer
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 483feabc0d50b..427be11cc948a 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index ef7ec0f2201eb..5c1ce755212a1 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -28,7 +28,7 @@
 
 JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "ArthurZ/jukebox-dummy": "https://huggingface.co/ArthurZ/jukebox-dummy/resolve/main/config.json",
-    "ArthurZ/jukebox-1h-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json",
+    "ArthurZ/jukebox-1b-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json",
 }
 
 
@@ -74,37 +74,6 @@ class JukeboxConfig(PretrainedConfig):
             The epsilon to use in the layer normalization layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        summary_type (`string`, *optional*, defaults to `"cls_index"`):
-            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
-            [`TFJukeboxDoubleHeadsModel`].
-
-            Has to be one of the following options:
-
-                - `"last"`: Take the last token hidden state (like XLNet).
-                - `"first"`: Take the first token hidden state (like BERT).
-                - `"mean"`: Take the mean of all tokens hidden states.
-                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - `"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (`bool`, *optional*, defaults to `True`):
-            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
-            [`TFJukeboxDoubleHeadsModel`].
-
-            Whether or not to add a projection after the vector extraction.
-        summary_activation (`str`, *optional*):
-            Argument used when doing sequence summary. Used in for the multiple choice head in
-            [`JukeboxDoubleHeadsModel`].
-
-            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
-            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
-            [`TFJukeboxDoubleHeadsModel`].
-
-            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
-        summary_first_dropout (`float`, *optional*, defaults to 0.1):
-            Argument used when doing sequence summary, used in the models [`JukeboxDoubleHeadsModel`] and
-            [`TFJukeboxDoubleHeadsModel`].
-
-            The dropout ratio to be used after the projection and activation.
         scale_attn_weights (`bool`, *optional*, defaults to `True`):
             Scale attention weights by dividing by sqrt(hidden_size)..
         use_cache (`bool`, *optional*, defaults to `True`):
@@ -141,6 +110,7 @@ class JukeboxConfig(PretrainedConfig):
     }
     # params are given for the `n` priors at the same time which means that you have
     # level2,level1,level0
+
     def __init__(
         self,
         vocab_size=50257,
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 457add5d574ed..95d4b6d7d578e 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -22,7 +22,6 @@
 
 import requests
 from transformers import JukeboxConfig, JukeboxModel
-from transformers.models import jukebox
 from transformers.utils import logging
 
 
diff --git a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
index 5f19a3949fa94..4b417038d3040 100644
--- a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
@@ -70,6 +70,6 @@ def convert_jukebox_checkpoint_to_pytorch(jukebox_checkpoint_path, jukebox_confi
         ),
     )
     args = parser.parse_args()
-    convert_jukebox_checkpoint_to_pytorch(
-        args.jukebox_checkpoint_path, args.jukebox_config_file, args.pytorch_dump_folder_path
-    )
+    # convert_jukebox_checkpoint_to_pytorch(
+    #     args.jukebox_checkpoint_path, args.jukebox_config_file, args.pytorch_dump_folder_path
+    # )
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index d01698cb4eb98..da649353cd518 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -42,12 +42,13 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
-_CONFIG_FOR_DOC = "JukeboxConfig"
-_TOKENIZER_FOR_DOC = "JukeboxTokenizer"
+# _CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
+# _CONFIG_FOR_DOC = "JukeboxConfig"
+# _TOKENIZER_FOR_DOC = "JukeboxTokenizer"
 
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "ArthurZ/jukebox-dummy",
+    "ArthurZ/jukebox-1b-lyrics",
     # See all Jukebox models at https://huggingface.co/models?filter=jukebox
 ]
 
@@ -3086,7 +3087,7 @@ def _set_gradient_checkpointing(self, module, value=False):
     and behavior.
 
     Parameters:
-        config ([`JukeboxConfig`]): Model configuration class with all the parameters of the model.
+        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 286f0833420c9..00989dc0d12a4 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -66,13 +66,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece"])
 
 
-class JukeboxTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
 class LayoutXLMTokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 8e688731aaf85..2b90205f7a08e 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -18,6 +18,7 @@
 import numpy as np
 
 from transformers import JukeboxConfig, is_torch_available
+from transformers.testing_utils import require_torch
 from transformers.trainer_utils import set_seed
 
 
@@ -33,6 +34,7 @@
     from transformers import JukeboxModel, JukeboxTokenizer  # ,JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+@require_torch
 class JukeboxModelTest(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
 
@@ -612,7 +614,7 @@ def test_gpu_sampling(self):
         start = timeit.default_timer()
         # import cProfile as profile
         # profile.runctx('model.ancestral_sample(ys, sampling_kwargs, config)', globals(), locals())
-        zs = model.ancestral_sample(ys, sampling_kwargs, model.config)
+        model.ancestral_sample(ys, sampling_kwargs, model.config)
         print(f"time to sample : {timeit.default_timer() - start}")
 
 

From 20cee2ebf28070ee80e4ffba5e90de3f5966ccc7 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Thu, 23 Jun 2022 15:04:50 +0200
Subject: [PATCH 005/196] style and delete fast tokenzer

---
 docs/source/en/model_doc/jukebox.mdx          |  28 +---
 docs/source/en/serialization.mdx              |   1 -
 .../models/jukebox/configuration_jukebox.py   |  97 +-----------
 .../jukebox/tokenization_jukebox_fast.py      | 147 ------------------
 4 files changed, 7 insertions(+), 266 deletions(-)
 delete mode 100644 src/transformers/models/jukebox/tokenization_jukebox_fast.py

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 8ae0d831810bf..02330e0a27c99 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -61,32 +61,6 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 [[autodoc]] JukeboxTokenizer - save_vocabulary
 
-## JukeboxTokenizerFast
-
-[[autodoc]] JukeboxTokenizerFast
-
-## Jukebox specific outputs
-
-[[autodoc]] models.jukebox.modeling_jukebox.JukeboxDoubleHeadsModelOutput
-
-[[autodoc]] models.jukebox.modeling_tf_jukebox.TFJukeboxDoubleHeadsModelOutput
-
 ## JukeboxModel
 
-[[autodoc]] JukeboxModel - forward - parallelize - deparallelize
-
-## JukeboxLMHeadModel
-
-[[autodoc]] JukeboxLMHeadModel - forward - parallelize - deparallelize
-
-## JukeboxDoubleHeadsModel
-
-[[autodoc]] JukeboxDoubleHeadsModel - forward
-
-## JukeboxForSequenceClassification
-
-[[autodoc]] JukeboxForSequenceClassification - forward
-
-## JukeboxForTokenClassification
-
-[[autodoc]] JukeboxForTokenClassification - forward
+[[autodoc]] JukeboxModel - forward 
diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index cc83b9a2502e5..c10428f6199ad 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -67,7 +67,6 @@ Ready-made configurations include the following architectures:
 - GPT Neo
 - GPT-J
 - I-BERT
-- Jukebox
 - LayoutLM
 - LongT5
 - M2M100
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 5c1ce755212a1..0f208eebe41c5 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -14,13 +14,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Jukebox configuration"""
-from collections import OrderedDict
-from typing import Any, List, Mapping, Optional
-
-from transformers import PreTrainedTokenizer, TensorType, is_torch_available
 
 from ...configuration_utils import PretrainedConfig
-from ...onnx import OnnxConfigWithPast, PatchingSpec
 from ...utils import logging
 
 
@@ -34,13 +29,13 @@
 
 class JukeboxConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`JukeboxModel`] or a [`TFJukeboxModel`]. It is
-    used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture.
-    Instantiating a configuration with the defaults will yield a similar configuration to that of the GPT-2
-    [small](https://huggingface.co/jukebox) architecture.
+    This is the configuration class to store the configuration of a [`JukeboxModel`].
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
+    yield a similar configuration to that of the Speech2Text
+    [ArthurZ/jukebox-1b-lyrics](https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json)
+    architecture.
 
 
     The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
@@ -50,7 +45,7 @@ class JukeboxConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 50257):
             Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`JukeboxModel`] or [`TFJukeboxModel`].
+            `inputs_ids` passed when calling [`JukeboxModel`]].
         n_positions (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
@@ -363,83 +358,3 @@ def __init__(
         self.eos_token_id = eos_token_id
 
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-
-
-class JukeboxOnnxConfig(OnnxConfigWithPast):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        task: str = "default",
-        patching_specs: List[PatchingSpec] = None,
-        use_past: bool = False,
-    ):
-        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
-        if not getattr(self._config, "pad_token_id", None):
-            # TODO: how to do that better?
-            self._config.pad_token_id = 0
-
-    @property
-    def inputs(self) -> Mapping[str, Mapping[int, str]]:
-        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
-        if self.use_past:
-            self.fill_with_past_key_values_(common_inputs, direction="inputs")
-            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
-        else:
-            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
-
-        return common_inputs
-
-    @property
-    def num_layers(self) -> int:
-        return self._config.n_layer
-
-    @property
-    def num_attention_heads(self) -> int:
-        return self._config.n_head
-
-    def generate_dummy_inputs(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        batch_size: int = -1,
-        seq_length: int = -1,
-        is_pair: bool = False,
-        framework: Optional[TensorType] = None,
-    ) -> Mapping[str, Any]:
-        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
-            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
-        )
-
-        # We need to order the input in the way they appears in the forward()
-        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
-
-        # Need to add the past_keys
-        if self.use_past:
-            if not is_torch_available():
-                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
-            else:
-                import torch
-
-                batch, seqlen = common_inputs["input_ids"].shape
-                # Not using the same length for past_key_values
-                past_key_values_length = seqlen + 2
-                past_shape = (
-                    batch,
-                    self.num_attention_heads,
-                    past_key_values_length,
-                    self._config.hidden_size // self.num_attention_heads,
-                )
-                ordered_inputs["past_key_values"] = [
-                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
-                ]
-
-        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
-        if self.use_past:
-            ordered_inputs["attention_mask"] = torch.cat(
-                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length)], dim=1
-            )
-
-        return ordered_inputs
-
-    @property
-    def default_onnx_opset(self) -> int:
-        return 13
diff --git a/src/transformers/models/jukebox/tokenization_jukebox_fast.py b/src/transformers/models/jukebox/tokenization_jukebox_fast.py
deleted file mode 100644
index f3948e0ef2ef8..0000000000000
--- a/src/transformers/models/jukebox/tokenization_jukebox_fast.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-
-import json
-from typing import TYPE_CHECKING, List, Optional, Tuple
-
-from tokenizers import pre_tokenizers
-
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_jukebox import JukeboxTokenizer
-
-
-if TYPE_CHECKING:
-    from transformers.pipelines.conversational import Conversation
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json"}
-
-PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "jukebox": "https://huggingface.co/jukebox/resolve/main/vocab.json",
-    },
-    "tokenizer_file": {"jukebox": "https://huggingface.co/jukebox/resolve/main/tokenizer.json"},
-}
-
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"jukebox": 1024}
-
-
-class JukeboxTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" Jukebox tokenizer, backed by HuggingFace's tokenizers library. Jukebox can be conditioned on 3
-    different inputs :
-        - Artists, unique ids are associated to each artist from the provided dictionary.
-        - Genres, unique ids are associated to each genre from the provided dictionary.
-        - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
-        vocabulary.
-
-    This tokenizer is straight forward and does not require trainingg. It should be able to process a different number
-    of inputs: as the conditioning of the model can be done on the three different queries. If None is provided,
-    defaults values will be used.:
-
-    ```
-    >>> from transformers import JukeboxTokenizer
-    >>> tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
-    >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
-    [15496, 995]
-    >>> tokenizer("Alan Jackson", "Country Rock")['input_ids']
-    [15496, 995]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    <Tip>
-
-    If nothing is provided, the genres and the artist will either be selected randomly or set to None
-
-    </Tip>
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
-    to: this superclass for more information regarding those methods.
-
-    Args:
-        artitst_vocab_file (`str`):
-            Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
-            'lyrics' and the values are their corresponding vocabulary files.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = JukeboxTokenizer
-
-    def __init__(
-        self, vocab_file=None, tokenizer_file=None, unk_token="<|endoftext|>", add_prefix_space=False, **kwargs
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        # TODO: should it be using WordLevel tokenizer ? Don't really know how that works yet
-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
-        """This corresponds to DialoGPT variants of models."""
-        input_ids = []
-        for is_user, text in conversation.iter_texts():
-            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
-
-        if len(input_ids) > self.model_max_length:
-            input_ids = input_ids[-self.model_max_length :]
-        return input_ids

From a3dace0f45262e8ff787dcf0cc85a7f5acf2764c Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Fri, 24 Jun 2022 11:21:10 +0200
Subject: [PATCH 006/196] fix consistency check

---
 src/transformers/models/jukebox/configuration_jukebox.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 0f208eebe41c5..849db231c5707 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -22,8 +22,8 @@
 logger = logging.get_logger(__name__)
 
 JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "ArthurZ/jukebox-dummy": "https://huggingface.co/ArthurZ/jukebox-dummy/resolve/main/config.json",
-    "ArthurZ/jukebox-1b-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json",
+    "ArthurZ/jukebox-dummy": "https://huggingface.co/ArthurZ/jukebox-dummy/blob/main/config.json",
+    "ArthurZ/jukebox-1b-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/blob/main/config.json",
 }
 
 
@@ -34,8 +34,7 @@ class JukeboxConfig(PretrainedConfig):
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
     yield a similar configuration to that of the Speech2Text
-    [ArthurZ/jukebox-1b-lyrics](https://huggingface.co/ArthurZ/jukebox-1b-lyrics/resolve/main/config.json)
-    architecture.
+    [ArthurZ/jukebox-1b-lyrics](https://huggingface.co/ArthurZ/jukebox-1b-lyrics) architecture.
 
 
     The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =

From 8ba643ded300199c654df85ec3450e94171928c2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 24 Jun 2022 14:51:53 +0000
Subject: [PATCH 007/196] update test and modelling

---
 .../models/jukebox/modeling_jukebox.py        | 31 ++++++++++++-------
 tests/models/jukebox/test_modeling_jukebox.py | 27 +++++++---------
 2 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index da649353cd518..0958f35b39da3 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -24,7 +24,7 @@
 import torch.utils.checkpoint
 from packaging import version
 from torch import nn
-
+from rich.progress import Progress
 
 if version.parse(torch.__version__) >= version.parse("1.6"):
     is_amp_available = True
@@ -3276,9 +3276,13 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
         y_list = split_batch(y, n_samples, max_batch_size)
         z_samples = []
-        for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
-            z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
-            z_samples.append(z_samples_i)
+        
+        with Progress() as progress:
+            task3 = progress.add_task("[cyan]Sampling Tokens...", total=len(z_list))
+            for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
+                z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
+                z_samples.append(z_samples_i)
+                progress.update(task3, advance=1)
         z = torch.cat(z_samples, dim=0)
 
         sampling_kwargs["max_batch_size"] = max_batch_size
@@ -3294,8 +3298,11 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
         print(f"Sampling level {level}")
         prior = self.priors[level]
         if total_length >= prior.n_ctx:
-            for start in get_starts(total_length, prior.n_ctx, hop_length):
-                zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+            with Progress() as progress:
+                task1 = progress.add_task("[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length)))
+                for start in get_starts(total_length, prior.n_ctx, hop_length):
+                    zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+                    progress.update(task1, advance=1)
         else:
             zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
         return zs
@@ -3315,12 +3322,14 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
             total_length = hps.sample_length // prior.raw_to_tokens
             hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
 
+            zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
+            
             # TODO either mask them or ddo better
-            if level != len(sample_levels) - 1:
-                labels_level = labels[level][0][: 4 + hps.max_bow_genre_size].unsqueeze(0)
-                zs = self.sample_level(zs, labels_level, sampling_kwargs[level], level, total_length, hop_length, hps)
-            else:
-                zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
+            # if level != len(sample_levels) - 1:
+            #     labels_level = labels[level][0][: 4 + hps.max_bow_genre_size].unsqueeze(0)
+            #     zs = self.sample_level(zs, labels_level, sampling_kwargs[level], level, total_length, hop_length, hps)
+            # else:
+            #     zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
             prior.to(zs[0].device)
             empty_cache()
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 2b90205f7a08e..d8bc88b295340 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -546,12 +546,13 @@ def test_model(self):
         self.assertTrue(torch.allclose(zs[0][0][0:50], top_50_expected_zs.long(), atol=1e-4))
 
     def test_gpu_sampling(self):
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics-local").eval()  # .to("cuda")
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics").eval()  # .to("cuda")
 
         # model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 386 * [0]]).long().to("cuda"), chunk_size=32)
 
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
-
+        set_seed(0)
+        
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
@@ -566,15 +567,15 @@ def test_gpu_sampling(self):
         model.config.sr = 44100
         model.config.hop_fraction = [0.125, 0.5, 0.5]
         model.config.n_samples = 1
-        model.config.sample_length = 2 * model.config.sr  # 32768
+        model.config.sample_length = 2645888  # 32768
 
-        model.config.sample_length_in_seconds = 2
+        model.config.sample_length_in_seconds = 60
         model.config.total_sample_length_in_seconds = 180
 
         metas = dict(
             artist="Zac Brown Band",
             genres="Country",
-            total_length=model.config.total_sample_length_in_seconds * model.config.sr,
+            total_length=2645888,
             offset=0,
             lyrics="""I met a traveller from an antique land,
             Who said—“Two vast and trunkless legs of stone
@@ -595,19 +596,13 @@ def test_gpu_sampling(self):
             sample_length=2 * model.config.sr,
         )
 
-        # tokens = tokenizer(
-        #     "Alan Jackson",
-        #     "rock",
-        #     "old town road",
-        #     total_length=model.config.total_sample_length_in_seconds * model.config.sr,
-        #     sample_length=2*model.config.sr,#32768, # 256 tokens from level 0, as row_to_tokens is 128
-        #     offset=0,
-        #     duration=2,
-        # )
-
         tokens = tokenizer(**metas)
-
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+        zs = [torch.zeros(1,0) for _ in range(len(model.priors))]
+        labels = torch.tensor([[inputs]]*3)
+        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
+
+        
         ys = np.array([[inputs]] * 3, dtype=np.int64)
         ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).long()  # .to("cuda")
 

From 93ae4fed829022a0e7fbf22c7cd590d3435418e4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 26 Jun 2022 15:45:34 +0000
Subject: [PATCH 008/196] add progress bar, `rich`dependency

---
 .../models/jukebox/modeling_jukebox.py        | 43 ++++++++++++++-----
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 0958f35b39da3..d92cc92fed4f3 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2215,6 +2215,8 @@ def primed_sample(
             assert x.shape == (n_samples, 1)
             empty_cache()
             # for sample_t in get_range(range(len(xs), sample_tokens)):
+            total = (sample_tokens - len(xs))
+            task3 = progress.add_task("Sampling indivdual tokens (super slow)", total = total )
             for sample_t in range(len(xs), sample_tokens):
 
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
@@ -2232,6 +2234,9 @@ def primed_sample(
                 x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
                 assert x.shape == (n_samples, 1)
                 xs.append(x.clone())
+                progress.update(task3,advance=1)
+                progress.update(0,advance=1/total)
+                
 
             del x
             self.transformer.del_cache()
@@ -3195,13 +3200,33 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     return alignments
 
 
+
+
+
+
+from rich.live import Live
+from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn,TimeRemainingColumn,TimeElapsedColumn
+
+
+
+progress = Progress(
+    "{task.description}",
+    SpinnerColumn(),
+    BarColumn(),
+    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+    TimeElapsedColumn(),
+    TimeRemainingColumn()
+)
+
+
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,
 )
 class JukeboxModel(JukeboxPreTrainedModel):
     _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
-
+   
+    
     def __init__(self, config):
         super().__init__(config)
 
@@ -3276,13 +3301,9 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
         y_list = split_batch(y, n_samples, max_batch_size)
         z_samples = []
-        
-        with Progress() as progress:
-            task3 = progress.add_task("[cyan]Sampling Tokens...", total=len(z_list))
-            for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
-                z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
-                z_samples.append(z_samples_i)
-                progress.update(task3, advance=1)
+        for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
+            z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
+            z_samples.append(z_samples_i)
         z = torch.cat(z_samples, dim=0)
 
         sampling_kwargs["max_batch_size"] = max_batch_size
@@ -3298,11 +3319,11 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
         print(f"Sampling level {level}")
         prior = self.priors[level]
         if total_length >= prior.n_ctx:
-            with Progress() as progress:
-                task1 = progress.add_task("[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length)))
+            with Live(progress):
+                progress.add_task("[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length)))
                 for start in get_starts(total_length, prior.n_ctx, hop_length):
                     zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
-                    progress.update(task1, advance=1)
+                    # progress.update(task1, advance=1)
         else:
             zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
         return zs

From 684af177f66a8d0d7011f37d63f586543e3f4cc6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 27 Jun 2022 15:03:24 +0000
Subject: [PATCH 009/196] clean and add progress bars

---
 .../models/jukebox/modeling_jukebox.py        | 41 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  6 +--
 tests/models/jukebox/test_modeling_jukebox.py |  9 ++--
 .../jukebox/test_tokenization_jukebox.py      |  2 +-
 4 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index d92cc92fed4f3..7b92f33b0b082 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -24,8 +24,10 @@
 import torch.utils.checkpoint
 from packaging import version
 from torch import nn
+
 from rich.progress import Progress
 
+
 if version.parse(torch.__version__) >= version.parse("1.6"):
     is_amp_available = True
     # from torch.cuda.amp import autocast
@@ -1890,7 +1892,7 @@ def __init__(
             n_depth=depth,
             attn_dropout=attn_dropout,
             resid_dropout=resid_dropout,
-            afn="relu",
+            afn="quick_gelu",
             scale=True,
             mask=mask,
             zero_out=zero_out,
@@ -2088,7 +2090,9 @@ def sample(
 
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
-                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
+                x = self.transformer(
+                    x, encoder_kv=encoder_kv, sample=True, fp16=True
+                )  # TODO put fp16 back # Transformer
                 if self.add_cond_after_transformer:
                     x = x + cond
                 assert x.shape == (n_samples, 1, self.width)
@@ -2174,6 +2178,7 @@ def primed_sample(
             start = 0
             x = None
             # for current_chunk_size in get_range(chunk_sizes):
+            task2 = progress.add_task("Primed Sampling chunks ", total=len(chunk_sizes))
             for current_chunk_size in chunk_sizes:
 
                 xs_prime, conds_prime = [], []
@@ -2202,6 +2207,8 @@ def primed_sample(
                 else:
                     del x_prime
 
+                progress.update(task2, advance=1)
+
             if get_preds:
                 x_prime = torch.cat(x_primes, dim=1)
                 assert x_prime.shape == (n_samples, len(xs), self.width)
@@ -2215,10 +2222,9 @@ def primed_sample(
             assert x.shape == (n_samples, 1)
             empty_cache()
             # for sample_t in get_range(range(len(xs), sample_tokens)):
-            total = (sample_tokens - len(xs))
-            task3 = progress.add_task("Sampling indivdual tokens (super slow)", total = total )
+            total = sample_tokens - len(xs)
+            task3 = progress.add_task("Sampling indivdual tokens ", total=total)
             for sample_t in range(len(xs), sample_tokens):
-
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
@@ -2234,9 +2240,8 @@ def primed_sample(
                 x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
                 assert x.shape == (n_samples, 1)
                 xs.append(x.clone())
-                progress.update(task3,advance=1)
-                progress.update(0,advance=1/total)
-                
+                progress.update(task3, advance=1)
+                progress.update(0, advance=1 / total)
 
             del x
             self.transformer.del_cache()
@@ -3200,13 +3205,8 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     return alignments
 
 
-
-
-
-
 from rich.live import Live
-from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn,TimeRemainingColumn,TimeElapsedColumn
-
+from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
 
 
 progress = Progress(
@@ -3215,7 +3215,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     BarColumn(),
     TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
     TimeElapsedColumn(),
-    TimeRemainingColumn()
+    TimeRemainingColumn(),
 )
 
 
@@ -3225,8 +3225,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
 )
 class JukeboxModel(JukeboxPreTrainedModel):
     _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
-   
-    
+
     def __init__(self, config):
         super().__init__(config)
 
@@ -3289,7 +3288,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         z_conds = prior.get_z_conds(zs, start, end)
         # if there are no levels above should return None!
 
-        # set y offset, sample_length and lyrics tokens
+        # set y offset, sample_length and lyrics okens
         y = prior.get_y(labels, start)
 
         empty_cache()
@@ -3320,7 +3319,9 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
         prior = self.priors[level]
         if total_length >= prior.n_ctx:
             with Live(progress):
-                progress.add_task("[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length)))
+                progress.add_task(
+                    "[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length))
+                )
                 for start in get_starts(total_length, prior.n_ctx, hop_length):
                     zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
                     # progress.update(task1, advance=1)
@@ -3344,7 +3345,7 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
             hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
-            
+
             # TODO either mask them or ddo better
             # if level != len(sample_levels) - 1:
             #     labels_level = labels[level][0][: 4 + hps.max_bow_genre_size].unsqueeze(0)
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 7b2185a04887a..d081fe267915f 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -291,7 +291,7 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
     # a type argument to add an artist token with self.getattr('artist') ?
     # TODO : is a call function required ?
 
-    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset, duration):
+    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
         """Convert the raw string to token ids
 
         Args:
@@ -307,13 +307,11 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset,
                 _description_
             offset (`_type_`):
                 _description_
-            duration (`_type_`):
-                _description_
         """
         input_ids = [total_length, offset, sample_length]
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
         artists_id, genres_ids, lyric_ids = self._convert_token_to_id(
-            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, duration
+            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, sample_length
         )
         input_ids += [artists_id] + genres_ids + lyric_ids
         attention_masks = [-INFINITY] * (self.max_n_lyric_tokens - len(lyrics_tokens)) + [0] * len(lyrics_tokens)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d8bc88b295340..73af6b171222c 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -552,7 +552,7 @@ def test_gpu_sampling(self):
 
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
         set_seed(0)
-        
+
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
@@ -598,11 +598,10 @@ def test_gpu_sampling(self):
 
         tokens = tokenizer(**metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        zs = [torch.zeros(1,0) for _ in range(len(model.priors))]
-        labels = torch.tensor([[inputs]]*3)
-        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
+        zs = [torch.zeros(1, 0) for _ in range(len(model.priors))]
+        labels = torch.tensor([[inputs]] * 3)
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
-        
         ys = np.array([[inputs]] * 3, dtype=np.int64)
         ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).long()  # .to("cuda")
 
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index d26ef85fc816d..dc42e9a56e17e 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -39,7 +39,7 @@ def test_tokenizer(self):
 
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
         tokenizer.max_n_lyric_tokens = 20
-        tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0, 1)
+        tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0)
         inputs, attention_masks = tokens["input_ids"], tokens["attention_masks"]
         EXPECTED_OUTPUT = [
             10584000,

From 29d26140a55a160fc7e47d88cbef10c85b988892 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Thu, 30 Jun 2022 13:08:01 +0000
Subject: [PATCH 010/196] update

---
 .../models/jukebox/modeling_jukebox.py        |  37 +++--
 .../models/jukebox/tokenization_jukebox.py    |  91 ++++++-------
 tests/models/jukebox/test_modeling_jukebox.py | 128 ++++++++++++++----
 .../jukebox/test_tokenization_jukebox.py      |   2 +-
 4 files changed, 168 insertions(+), 90 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 7b92f33b0b082..9f4035c60cd96 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -40,7 +40,7 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, logging
 from .configuration_jukebox import JukeboxConfig
-
+from .tokenization_jukebox import get_relevant_lyric_tokens
 
 logger = logging.get_logger(__name__)
 
@@ -705,9 +705,9 @@ def __init__(self, config):
         if not config.sample_length:
             downsamples = calculate_strides(config.vq_vae_strides_t, config.vq_vae_downs_t)
             top_raw_to_tokens = np.prod(downsamples)
-            config.sample_length = (
+            config.sample_length = ((
                 config.sample_length_in_seconds * config.sr // top_raw_to_tokens
-            ) * top_raw_to_tokens
+            ) * top_raw_to_tokens).astype(int)
 
         input_shape = (config.sample_length, 1)
         block_kwargs = dict(
@@ -2755,29 +2755,38 @@ def conditioner_block(_level):
         )
 
     def get_y(self, labels, start, get_indices=False):
-        # labeler does not exist this should be removed
-        # if isinstance(self.labeller, EmptyLabeller):
-        #     return None
-
-        # y = labels["y"].clone()
-        y = labels.clone()
+        y = labels["y"].clone()
+        # y = labels.clone()
 
         # Set sample_length to match this level
         y[:, 2] = int(self.sample_length)
 
         # Set offset
         y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
-        if get_indices:
-            indices = None
-            return y, indices  # here the indices should be the indices of the lyrics to take into account...
-        return y
-        # Set lyric tokens
         indices = self.labeller.set_y_lyric_tokens(y, labels)
         if get_indices:
             return y, indices
         else:
             return y
 
+    def set_y_lyric_tokens(self, ys, labels):
+        info = labels['info']
+        assert ys.shape[0] == len(info)
+        if self.n_tokens > 0:
+            # total_length, offset, duration):
+            tokens_list = []
+            indices_list = []  # whats the index of each current character in original array
+            for i in range(ys.shape[0]):
+                full_tokens = info[i]['full_tokens']
+                total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
+                tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
+                tokens_list.append(tokens)
+                indices_list.append(indices)
+            ys[:, -self.n_tokens:] = t.tensor(tokens_list, dtype=t.long, device='cpu')
+            return indices_list
+        else:
+            return None
+
     def get_z_conds(self, zs, start, end):
         if self.level != self.levels - 1:
             assert start % self.cond_downsample == end % self.cond_downsample == 0
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index d081fe267915f..42654646af3e9 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -44,6 +44,41 @@
 }
 
 
+def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
+    """Extract only the relevant tokens based on the character position. A total of
+    `max_n_lyric_tokens` tokens will be returned. If the provided token sequence is smaller, it will be padded,
+    othewise, only characters ranging from the midpoint - `max_n_lyric_tokens//2` to the midpoint +
+    `max_n_lyric_tokens//2` will be returned. This *focuses* on the most relevant tokens (in time) for the
+    sequence.
+
+    Args: # TODO : args to prettify
+        full_tokens (`_type_`):
+            _description_
+        total_length (`_type_`):
+            _description_
+        offset (`_type_`):
+            _description_
+        duration (`_type_`):
+            _description_
+    """
+    if len(full_tokens) < max_n_lyric_tokens:
+        tokens = [0] * (max_n_lyric_tokens - len(full_tokens)) + full_tokens
+        indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
+    else:
+        assert 0 <= offset < total_length
+        midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+        midpoint = min(
+            max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2
+        )
+        tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
+        indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
+    assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
+    assert (
+        len(indices) == max_n_lyric_tokens
+    ), f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
+    assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
+    return tokens
+
 class JukeboxTokenizer(PreTrainedTokenizer):
     """
     Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
@@ -100,13 +135,12 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     max_lyric_input_size = PRETRAINED_LYRIC_TOKENS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(self, vocab_file, max_n_lyric_tokens=512, n_genres=5, unk_token="<|endoftext|>", **kwargs):
+    def __init__(self, vocab_file, n_genres=5, unk_token="<|endoftext|>", **kwargs):
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
             **kwargs,
         )
-        self.max_n_lyric_tokens = max_n_lyric_tokens
         self.n_genres = n_genres
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
@@ -128,41 +162,6 @@ def vocab_size(self):
     def get_vocab(self):
         return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
 
-    def get_relevant_lyric_tokens(self, full_tokens, total_length, offset, duration):
-        """Extract only the relevant tokens based on the character position. A total of
-        `max_n_lyric_tokens` tokens will be returned. If the provided token sequence is smaller, it will be padded,
-        othewise, only characters ranging from the midpoint - `max_n_lyric_tokens//2` to the midpoint +
-        `max_n_lyric_tokens//2` will be returned. This *focuses* on the most relevant tokens (in time) for the
-        sequence.
-
-        Args: # TODO : args to prettify
-            full_tokens (`_type_`):
-                _description_
-            total_length (`_type_`):
-                _description_
-            offset (`_type_`):
-                _description_
-            duration (`_type_`):
-                _description_
-        """
-        if len(full_tokens) < self.max_n_lyric_tokens:
-            tokens = [0] * (self.max_n_lyric_tokens - len(full_tokens)) + full_tokens
-            indices = [-1] * (self.max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
-        else:
-            assert 0 <= offset < total_length
-            midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
-            midpoint = min(
-                max(midpoint, self.max_n_lyric_tokens // 2), len(full_tokens) - self.max_n_lyric_tokens // 2
-            )
-            tokens = full_tokens[midpoint - self.max_n_lyric_tokens // 2 : midpoint + self.max_n_lyric_tokens // 2]
-            indices = list(range(midpoint - self.max_n_lyric_tokens // 2, midpoint + self.max_n_lyric_tokens // 2))
-        assert len(tokens) == self.max_n_lyric_tokens, f"Expected length {self.max_n_lyric_tokens}, got {len(tokens)}"
-        assert (
-            len(indices) == self.max_n_lyric_tokens
-        ), f"Expected length {self.max_n_lyric_tokens}, got {len(indices)}"
-        assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
-        return tokens
-
     def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, duration):
         """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
         The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
@@ -185,8 +184,8 @@ def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, dur
         artists_id = self.artists_encoder.get(artist)
         genres_ids = [self.genres_encoder.get(genre) for genre in genres]
         lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
-        lyric_ids = self.get_relevant_lyric_tokens(lyric_ids, total_length, offset, duration)
-        return artists_id, genres_ids, lyric_ids
+        y = self.get_relevant_lyric_tokens(lyric_ids, total_length, offset, duration)
+        return artists_id, genres_ids, y, lyric_ids
 
     def _tokenize(self, lyrics):
         """
@@ -243,8 +242,8 @@ def prepare_for_tokenization(
         """
         artist = self._normalize(artist)
         genres = self._normalize(genres).split("_")  # split is for the full dictionnary with combined genres
-
-        lyrics = normalizers.BertNormalizer().normalize_str(lyrics)
+        normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
+        lyrics = normalizer.normalize_str(lyrics)
         lyrics = lyrics.replace("\\", "\n")
         lyrics = self.out_of_vocab.sub("", lyrics)
         return artist, genres, lyrics, kwargs
@@ -310,12 +309,12 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
         """
         input_ids = [total_length, offset, sample_length]
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
-        artists_id, genres_ids, lyric_ids = self._convert_token_to_id(
-            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, sample_length
+        artists_id, genres_ids, relevant_tokens, full_tokens = self._convert_token_to_id(
+            artists_tokens, genres_tokens, relevant_tokens, total_length, offset, sample_length
         )
-        input_ids += [artists_id] + genres_ids + lyric_ids
-        attention_masks = [-INFINITY] * (self.max_n_lyric_tokens - len(lyrics_tokens)) + [0] * len(lyrics_tokens)
-        return {"input_ids": input_ids, "attention_masks": attention_masks}
+        input_ids += [artists_id] + genres_ids + y
+        attention_masks = [-INFINITY] * (len(full_tokens) - len(relevant_tokens)) + [0] * len(relevant_tokens)
+        return {"input_ids": {'y': input_ids, 'full_tokens':full_tokens}, "attention_masks": attention_masks}
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 73af6b171222c..d60d2829fb8e0 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -545,7 +545,7 @@ def test_model(self):
 
         self.assertTrue(torch.allclose(zs[0][0][0:50], top_50_expected_zs.long(), atol=1e-4))
 
-    def test_gpu_sampling(self):
+    def test_1b_lyrics(self):
         model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics").eval()  # .to("cuda")
 
         # model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 386 * [0]]).long().to("cuda"), chunk_size=32)
@@ -567,50 +567,120 @@ def test_gpu_sampling(self):
         model.config.sr = 44100
         model.config.hop_fraction = [0.125, 0.5, 0.5]
         model.config.n_samples = 1
-        model.config.sample_length = 2645888  # 32768
 
-        model.config.sample_length_in_seconds = 60
+        model.config.sample_length_in_seconds = 20
+        model.config.sample_length = (int(model.config.sample_length_in_seconds*model.config.sr)//
+                                    model.priors[-1].raw_to_tokens)*model.priors[-1].raw_to_tokens
+
         model.config.total_sample_length_in_seconds = 180
 
         metas = dict(
             artist="Zac Brown Band",
             genres="Country",
-            total_length=2645888,
+            total_length=440960,
             offset=0,
             lyrics="""I met a traveller from an antique land,
-            Who said—“Two vast and trunkless legs of stone
-            Stand in the desert. . . . Near them, on the sand,
-            Half sunk a shattered visage lies, whose frown,
-            And wrinkled lip, and sneer of cold command,
-            Tell that its sculptor well those passions read
-            Which yet survive, stamped on these lifeless things,
-            The hand that mocked them, and the heart that fed;
-            And on the pedestal, these words appear:
-            My name is Ozymandias, King of Kings;
-            Look on my Works, ye Mighty, and despair!
-            Nothing beside remains. Round the decay
-            Of that colossal Wreck, boundless and bare
-            The lone and level sands stretch far away
-            """,
+Who said—“Two vast and trunkless legs of stone
+Stand in the desert. . . . Near them, on the sand,
+Half sunk a shattered visage lies, whose frown,
+And wrinkled lip, and sneer of cold command,
+Tell that its sculptor well those passions read
+Which yet survive, stamped on these lifeless things,
+The hand that mocked them, and the heart that fed;
+And on the pedestal, these words appear:
+My name is Ozymandias, King of Kings;
+Look on my Works, ye Mighty, and despair!
+Nothing beside remains. Round the decay
+Of that colossal Wreck, boundless and bare
+The lone and level sands stretch far away
+""",
             duration=2,
-            sample_length=2 * model.config.sr,
+            sample_length=786432,
         )
 
         tokens = tokenizer(**metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        zs = [torch.zeros(1, 0) for _ in range(len(model.priors))]
-        labels = torch.tensor([[inputs]] * 3)
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        
+        zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
+        labels = torch.tensor([[inputs]]*3).cpu()
+        # model = model.cuda()
 
-        ys = np.array([[inputs]] * 3, dtype=np.int64)
-        ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).long()  # .to("cuda")
+        set_seed(0)
+        torch.backends.cuda.matmul.allow_tf32 = False
+        torch.backends.cudnn.enabled = False
+        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
 
-        start = timeit.default_timer()
-        # import cProfile as profile
-        # profile.runctx('model.ancestral_sample(ys, sampling_kwargs, config)', globals(), locals())
-        model.ancestral_sample(ys, sampling_kwargs, model.config)
-        print(f"time to sample : {timeit.default_timer() - start}")
+        EXPECTED_OUTPUT = torch.tensor([1489, 1489,  324, 1489, 1599, 1072, 1357, 1489,  784, 1272])
+
+        labels[1] =torch.tensor([2645888,       0,  262144,    1069,      11 ] + [-1]*384 ).cpu()
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1,1848).cpu()),dim=-1)
+        zs = model._sample(zs,labels , sampling_kwargs, [1],model.config)
+
+
+        
+    def test_5b_lyrics(self):
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval()  # .to("cuda")
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=512)
+        set_seed(0)
+
+        sampling_temperature = 0.98
+        lower_batch_size = 16
+        max_batch_size = 16
+        lower_level_chunk_size = 32
+        chunk_size = 32
+        sampling_kwargs = [
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
+        ]
+
+        model.config.sr = 44100
+        model.config.hop_fraction = [0.125, 0.5, 0.5]
+        model.config.n_samples = 1
+
+        model.config.sample_length_in_seconds = 20
+        model.config.sample_length = (int(model.config.sample_length_in_seconds*model.config.sr)//
+                                    model.priors[-1].raw_to_tokens)*model.priors[-1].raw_to_tokens
+
+        model.config.total_sample_length_in_seconds = 180
+
+        metas = dict(
+            artist="Zac Brown Band",
+            genres="Country",
+            total_length=440960,
+            offset=0,
+            lyrics="""I met a traveller from an antique land,
+Who said—“Two vast and trunkless legs of stone
+Stand in the desert. . . . Near them, on the sand,
+Half sunk a shattered visage lies, whose frown,
+And wrinkled lip, and sneer of cold command,
+Tell that its sculptor well those passions read
+Which yet survive, stamped on these lifeless things,
+The hand that mocked them, and the heart that fed;
+And on the pedestal, these words appear:
+My name is Ozymandias, King of Kings;
+Look on my Works, ye Mighty, and despair!
+Nothing beside remains. Round the decay
+Of that colossal Wreck, boundless and bare
+The lone and level sands stretch far away
+""",
+            duration=2,
+            sample_length=786432,
+        )
+
+        tokens = tokenizer(**metas)
+        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+        
+        zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
+        labels = torch.tensor([[inputs]]*3).cpu()
+        # model = model.cuda()
+
+        set_seed(0)
+        torch.backends.cuda.matmul.allow_tf32 = False
+        torch.backends.cudnn.enabled = False
+        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
 
+        EXPECTED_OUTPUT = []
 
 if __name__ == "__main__":
     tester = JukeboxModelTest()
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index dc42e9a56e17e..6b50b10251b80 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -40,7 +40,7 @@ def test_tokenizer(self):
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
         tokenizer.max_n_lyric_tokens = 20
         tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0)
-        inputs, attention_masks = tokens["input_ids"], tokens["attention_masks"]
+        inputs, attention_masks = tokens["input_ids"]['y'], tokens["attention_masks"]
         EXPECTED_OUTPUT = [
             10584000,
             0,

From 81afaadc23453ce6a01ecf782b3299f6096129e3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Thu, 30 Jun 2022 14:44:21 +0000
Subject: [PATCH 011/196] update code

---
 .../models/jukebox/modeling_jukebox.py            | 15 +++++++++------
 .../models/jukebox/tokenization_jukebox.py        | 14 ++++++++------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9f4035c60cd96..118880dd0f62f 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2086,12 +2086,14 @@ def sample(
             if get_preds:
                 preds = []
             # for sample_t in get_range(range(0, sample_tokens)):
+            total = sample_tokens - len(xs)
+            task3 = progress.add_task("Sampling indivdual tokens ", total=total)
             for sample_t in range(0, sample_tokens):
 
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 x = self.transformer(
-                    x, encoder_kv=encoder_kv, sample=True, fp16=True
+                    x, encoder_kv=encoder_kv, sample=True, fp16=fp16
                 )  # TODO put fp16 back # Transformer
                 if self.add_cond_after_transformer:
                     x = x + cond
@@ -2105,6 +2107,8 @@ def sample(
                 x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
                 assert x.shape == (n_samples, 1)
                 xs.append(x.clone())
+                progress.update(task3, advance=1)
+                progress.update(0, advance=1 / total)
 
             del x
             self.transformer.del_cache()
@@ -2763,26 +2767,25 @@ def get_y(self, labels, start, get_indices=False):
 
         # Set offset
         y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
-        indices = self.labeller.set_y_lyric_tokens(y, labels)
+        indices = self.set_y_lyric_tokens(y, labels)
         if get_indices:
             return y, indices
         else:
             return y
 
     def set_y_lyric_tokens(self, ys, labels):
-        info = labels['info']
-        assert ys.shape[0] == len(info)
+        # assert ys.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
             tokens_list = []
             indices_list = []  # whats the index of each current character in original array
             for i in range(ys.shape[0]):
-                full_tokens = info[i]['full_tokens']
+                full_tokens = labels['full_tokens']
                 total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
                 tokens_list.append(tokens)
                 indices_list.append(indices)
-            ys[:, -self.n_tokens:] = t.tensor(tokens_list, dtype=t.long, device='cpu')
+            ys[:, -self.n_tokens:] = torch.tensor(tokens_list, dtype=torch.long, device='cpu')
             return indices_list
         else:
             return None
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 42654646af3e9..ed78bf60175b8 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -77,7 +77,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
         len(indices) == max_n_lyric_tokens
     ), f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
     assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
-    return tokens
+    return tokens, indices
 
 class JukeboxTokenizer(PreTrainedTokenizer):
     """
@@ -135,14 +135,14 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     max_lyric_input_size = PRETRAINED_LYRIC_TOKENS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(self, vocab_file, n_genres=5, unk_token="<|endoftext|>", **kwargs):
+    def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens = 512, unk_token="<|endoftext|>", **kwargs):
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
             **kwargs,
         )
         self.n_genres = n_genres
-
+        self.max_n_lyric_tokens = max_n_lyric_tokens
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             vocabulary = json.load(vocab_handle)
             self.artists_encoder = vocabulary["artists"]
@@ -183,8 +183,9 @@ def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, dur
         """
         artists_id = self.artists_encoder.get(artist)
         genres_ids = [self.genres_encoder.get(genre) for genre in genres]
+        genres_ids = genres_ids + [-1] * (self.n_genres - len(genres_ids))
         lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
-        y = self.get_relevant_lyric_tokens(lyric_ids, total_length, offset, duration)
+        y,_ = get_relevant_lyric_tokens(lyric_ids, self.max_n_lyric_tokens,total_length, offset, duration)
         return artists_id, genres_ids, y, lyric_ids
 
     def _tokenize(self, lyrics):
@@ -306,13 +307,14 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
                 _description_
             offset (`_type_`):
                 _description_
+            max_n_lyric_tokens (`int`): 
         """
         input_ids = [total_length, offset, sample_length]
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
         artists_id, genres_ids, relevant_tokens, full_tokens = self._convert_token_to_id(
-            artists_tokens, genres_tokens, relevant_tokens, total_length, offset, sample_length
+            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, sample_length
         )
-        input_ids += [artists_id] + genres_ids + y
+        input_ids += [artists_id] + genres_ids + relevant_tokens
         attention_masks = [-INFINITY] * (len(full_tokens) - len(relevant_tokens)) + [0] * len(relevant_tokens)
         return {"input_ids": {'y': input_ids, 'full_tokens':full_tokens}, "attention_masks": attention_masks}
 

From 7d361d8187e4adc0f96181071960945e18376e44 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Thu, 30 Jun 2022 16:55:36 +0000
Subject: [PATCH 012/196] update tokenizer doc

---
 .../models/jukebox/tokenization_jukebox.py    | 61 +++++++++++--------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index ed78bf60175b8..870de66691bd6 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -26,6 +26,7 @@
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
+import torch
 
 logger = logging.get_logger(__name__)
 
@@ -45,21 +46,24 @@
 
 
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
-    """Extract only the relevant tokens based on the character position. A total of
+    """
+    Extract only the relevant tokens based on the character position. A total of
     `max_n_lyric_tokens` tokens will be returned. If the provided token sequence is smaller, it will be padded,
     othewise, only characters ranging from the midpoint - `max_n_lyric_tokens//2` to the midpoint +
     `max_n_lyric_tokens//2` will be returned. This *focuses* on the most relevant tokens (in time) for the
     sequence.
 
     Args: # TODO : args to prettify
-        full_tokens (`_type_`):
-            _description_
-        total_length (`_type_`):
-            _description_
-        offset (`_type_`):
-            _description_
-        duration (`_type_`):
-            _description_
+        full_tokens (`List[int]`):
+            List containing the ids of the entire lyrics. 
+        total_length (`int`):
+            Total expected length of the music (not all of it is generated, see duration), in samples. 
+        offset (`int`):
+            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted 
+            take that into account
+        duration (`int`):
+            Expected duration of the generated music, in seconds. The duration has to be smaller than the 
+            total lenght, which represent the overall length of the signal, 
     """
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = [0] * (max_n_lyric_tokens - len(full_tokens)) + full_tokens
@@ -121,10 +125,10 @@ class JukeboxTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
             'lyrics' and the values are their corresponding vocabulary files.
-        max_n_lyric_tokens (`int`, `optional`, defaults to 512):
-            Maximum number of lyric tokens to keep.
         n_genres (`int`, `optional`, defaults to 1):
             Maximum number of genres to use for composition.
+        max_n_lyric_tokens (`int`, `optional`, defaults to 512):
+            Maximum number of lyric tokens to keep.
         unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -272,12 +276,12 @@ def _normalize(self, text: str) -> str:
     def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
         """Converts an index (integer) in a token (str) using the vocab.
         Args:
-            artists_index (`_type_`):
-                _description_
-            genres_index (`_type_`):
-                _description_
-            lyric_index (`_type_`):
-                _description_
+            artists_index (`int`):
+                Index of the artist in its corresponding dictionnary. 
+            genres_index (`Union[List[int], int]`):
+               Index of the genre in its corresponding dictionnary. 
+            lyric_index (`List[int]`):
+                List of character indices, which each correspond to a character. 
         """
         artist = self.artists_decoder.get(artists_index)
         genres = [self.genres_decoder.get(genre) for genre in genres_index]
@@ -289,25 +293,26 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
 
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
-    # TODO : is a call function required ?
 
-    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
-        """Convert the raw string to token ids
+    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset, return_tensor = "pt"):
+        """
+        Convert the raw string to token ids
 
         Args:
-            artist (`_type_`):
+            artist (`str`):
                 _description_
-            genre (`_type_`):
+            genre (`str`):
                 _description_
-            lyrics (`_type_`):
+            lyrics (`srt`):
                 _description_
-            total_length (`_type_`):
+            total_length (`int`):
                 _description_
-            sample_length (`_type_`):
+            sample_length (`int`):
                 _description_
-            offset (`_type_`):
+            offset (`int`):
                 _description_
             max_n_lyric_tokens (`int`): 
+                _description_
         """
         input_ids = [total_length, offset, sample_length]
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
@@ -316,7 +321,9 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
         )
         input_ids += [artists_id] + genres_ids + relevant_tokens
         attention_masks = [-INFINITY] * (len(full_tokens) - len(relevant_tokens)) + [0] * len(relevant_tokens)
-        return {"input_ids": {'y': input_ids, 'full_tokens':full_tokens}, "attention_masks": attention_masks}
+        # TODO properly handle the return pt tensor option
+        if return_tensor == "pt":
+            return {"input_ids": {'y': torch.tensor([input_ids]), 'full_tokens':full_tokens}, "attention_masks":torch.tensor( attention_masks)}
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """

From 1d4d0d2643dc01660f11963e1c852c536eed73ce Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 1 Jul 2022 14:43:53 +0000
Subject: [PATCH 013/196] only CPU run for now need to clean and handle device
 properly

---
 .../models/jukebox/modeling_jukebox.py             | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 118880dd0f62f..2703755bcd0c7 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -410,10 +410,10 @@ def reset_k(self):
         self.k_elem = None
         # self.register_buffer('k',  torch.zeros(self.k_bins, self.emb_width).cuda())
 
-        if torch.cuda.is_available():
-            self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width).to("cuda"))
-        else:
-            self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
+        # if torch.cuda.is_available():
+        #     self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width).to("cuda"))
+        # else:
+        self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
 
     def _tile(self, x):
         d, ew = x.shape
@@ -827,7 +827,7 @@ def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
         return zs
 
     def sample(self, n_samples):
-        zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cuda") for z_shape in self.z_shapes]
+        zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
         return self.decode(zs)
 
     def forward(self, x, hps, loss_fn="l1"):
@@ -2078,7 +2078,7 @@ def sample(
         else:
             assert x_cond is None
             x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
-                "cuda" if torch.cuda.is_available() else "cpu"
+                "cpu" if torch.cuda.is_available() else "cpu"
             )
 
         with torch.no_grad():
@@ -3336,7 +3336,7 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
                 )
                 for start in get_starts(total_length, prior.n_ctx, hop_length):
                     zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
-                    # progress.update(task1, advance=1)
+
         else:
             zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
         return zs

From 965b2dc0177e6907479baa79f6946b63882922bd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 12:08:32 +0000
Subject: [PATCH 014/196] update tokenizer to support v3 dictionnary

---
 .../models/jukebox/tokenization_jukebox.py    | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 870de66691bd6..94eebd3c2f3ae 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -152,8 +152,13 @@ def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens = 512, unk_token="
             self.artists_encoder = vocabulary["artists"]
             self.genres_encoder = vocabulary["genres"]
             self.lyrics_encoder = vocabulary["lyrics"]
-
-        self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")  # FIXME: should be an argument?
+        
+        oov = "[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]"
+        # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
+        if len(self.lyrics_encoder)== 79:
+            oov += "+" 
+            
+        self.out_of_vocab = re.compile(oov)  # FIXME: should be an argument?
 
         self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
         self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
@@ -262,12 +267,14 @@ def _normalize(self, text: str) -> str:
         """
         import re
 
-        accepted = frozenset(
-            [chr(i) for i in range(ord("a"), ord("z") + 1)]
-            + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+        accepted = [chr(i) for i in range(ord("a"), ord("z") + 1)] \
+            + [chr(i) for i in range(ord("A"), ord("Z") + 1)] \
             + [chr(i) for i in range(ord("0"), ord("9") + 1)]
-        )
 
+        # In v2, " " is not accepted while it is for v3 
+        if len(self.lyrics_encoder)== 79:
+            accepted += [" "]
+        accepted = frozenset(accepted) 
         rex = re.compile(r"_+")
         text = "".join([c if c in accepted else "_" for c in text.lower()])
         text = rex.sub("_", text).strip("_")

From df30e00e74d275d6285e541513ddd731361a4d95 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 12:08:45 +0000
Subject: [PATCH 015/196] update tests

---
 tests/models/jukebox/test_modeling_jukebox.py | 408 ++++--------------
 1 file changed, 91 insertions(+), 317 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d60d2829fb8e0..ccf0cd507fd5b 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -38,6 +38,29 @@
 class JukeboxModelTest(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
 
+    metas = dict(
+            artist="Zac Brown Band",
+            genres="Country",
+            total_length=440960,
+            offset=0,
+            lyrics="""I met a traveller from an antique land,
+    Who said—“Two vast and trunkless legs of stone
+    Stand in the desert. . . . Near them, on the sand,
+    Half sunk a shattered visage lies, whose frown,
+    And wrinkled lip, and sneer of cold command,
+    Tell that its sculptor well those passions read
+    Which yet survive, stamped on these lifeless things,
+    The hand that mocked them, and the heart that fed;
+    And on the pedestal, these words appear:
+    My name is Ozymandias, King of Kings;
+    Look on my Works, ye Mighty, and despair!
+    Nothing beside remains. Round the decay
+    Of that colossal Wreck, boundless and bare
+    The lone and level sands stretch far away
+    """,
+            duration=2,
+            sample_length=786432,
+        )
     # @slow
     def test_model(self):
         set_seed(0)
@@ -545,13 +568,19 @@ def test_model(self):
 
         self.assertTrue(torch.allclose(zs[0][0][0:50], top_50_expected_zs.long(), atol=1e-4))
 
+    def test_conditioning(self):
+        pass
+        # x,x_conds and y_conds should be the same  before calling the sampling
+        # start and end embeding 
+        # expected conditioning to match
+
     def test_1b_lyrics(self):
         model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics").eval()  # .to("cuda")
 
         # model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 386 * [0]]).long().to("cuda"), chunk_size=32)
 
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
-        set_seed(0)
+
 
         sampling_temperature = 0.98
         lower_batch_size = 16
@@ -564,46 +593,15 @@ def test_1b_lyrics(self):
             dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
         ]
 
-        model.config.sr = 44100
-        model.config.hop_fraction = [0.125, 0.5, 0.5]
-        model.config.n_samples = 1
-
-        model.config.sample_length_in_seconds = 20
-        model.config.sample_length = (int(model.config.sample_length_in_seconds*model.config.sr)//
-                                    model.priors[-1].raw_to_tokens)*model.priors[-1].raw_to_tokens
 
-        model.config.total_sample_length_in_seconds = 180
-
-        metas = dict(
-            artist="Zac Brown Band",
-            genres="Country",
-            total_length=440960,
-            offset=0,
-            lyrics="""I met a traveller from an antique land,
-Who said—“Two vast and trunkless legs of stone
-Stand in the desert. . . . Near them, on the sand,
-Half sunk a shattered visage lies, whose frown,
-And wrinkled lip, and sneer of cold command,
-Tell that its sculptor well those passions read
-Which yet survive, stamped on these lifeless things,
-The hand that mocked them, and the heart that fed;
-And on the pedestal, these words appear:
-My name is Ozymandias, King of Kings;
-Look on my Works, ye Mighty, and despair!
-Nothing beside remains. Round the decay
-Of that colossal Wreck, boundless and bare
-The lone and level sands stretch far away
-""",
-            duration=2,
-            sample_length=786432,
-        )
+        self.metas.sample_length=model.priors[-1].sample_length
 
-        tokens = tokenizer(**metas)
+        tokens = tokenizer(**self.metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
         
         zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
-        labels = torch.tensor([[inputs]]*3).cpu()
-        # model = model.cuda()
+        labels = [{},{}, inputs]
+
 
         set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
@@ -611,16 +609,38 @@ def test_1b_lyrics(self):
         zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
 
         EXPECTED_OUTPUT = torch.tensor([1489, 1489,  324, 1489, 1599, 1072, 1357, 1489,  784, 1272])
+        
+        # TODO generate the original outputs 
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[-1][0,:30],EXPECTED_OUTPUT)
+
 
-        labels[1] =torch.tensor([2645888,       0,  262144,    1069,      11 ] + [-1]*384 ).cpu()
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1,1848).cpu()),dim=-1)
+        labels[1]['y']= inputs['y'][:,:9]
+        labels[0]['y']= inputs['y'][:,:9]
+
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
         zs = model._sample(zs,labels , sampling_kwargs, [1],model.config)
+        # TODO find the expected outputs 
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[-2][0,:30],EXPECTED_OUTPUT)
+
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
+        zs = model._sample(zs,labels , sampling_kwargs, [0],model.config)
+        # TODO find the expected outputs 
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[0][0,:30],EXPECTED_OUTPUT)
 
 
         
     def test_5b_lyrics(self):
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval()  # .to("cuda")
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=512)
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval() 
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-5b-lyrics")
         set_seed(0)
 
         sampling_temperature = 0.98
@@ -629,50 +649,16 @@ def test_5b_lyrics(self):
         lower_level_chunk_size = 32
         chunk_size = 32
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=30),
         ]
 
-        model.config.sr = 44100
-        model.config.hop_fraction = [0.125, 0.5, 0.5]
-        model.config.n_samples = 1
-
-        model.config.sample_length_in_seconds = 20
-        model.config.sample_length = (int(model.config.sample_length_in_seconds*model.config.sr)//
-                                    model.priors[-1].raw_to_tokens)*model.priors[-1].raw_to_tokens
-
-        model.config.total_sample_length_in_seconds = 180
-
-        metas = dict(
-            artist="Zac Brown Band",
-            genres="Country",
-            total_length=440960,
-            offset=0,
-            lyrics="""I met a traveller from an antique land,
-Who said—“Two vast and trunkless legs of stone
-Stand in the desert. . . . Near them, on the sand,
-Half sunk a shattered visage lies, whose frown,
-And wrinkled lip, and sneer of cold command,
-Tell that its sculptor well those passions read
-Which yet survive, stamped on these lifeless things,
-The hand that mocked them, and the heart that fed;
-And on the pedestal, these words appear:
-My name is Ozymandias, King of Kings;
-Look on my Works, ye Mighty, and despair!
-Nothing beside remains. Round the decay
-Of that colossal Wreck, boundless and bare
-The lone and level sands stretch far away
-""",
-            duration=2,
-            sample_length=786432,
-        )
-
-        tokens = tokenizer(**metas)
+        tokens = tokenizer(**self.metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
         
         zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
-        labels = torch.tensor([[inputs]]*3).cpu()
+        labels = [{}, {}, inputs]
         # model = model.cuda()
 
         set_seed(0)
@@ -680,7 +666,32 @@ def test_5b_lyrics(self):
         torch.backends.cudnn.enabled = False
         zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
 
-        EXPECTED_OUTPUT = []
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[-1][0,:30],EXPECTED_OUTPUT)
+
+
+        labels[1]['y']= inputs['y'][:,:9]
+        labels[0]['y']= inputs['y'][:,:9]
+
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
+        zs = model._sample(zs,labels , sampling_kwargs, [1],model.config)
+        # TODO find the expected outputs 
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[-2][0,:30],EXPECTED_OUTPUT)
+
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
+        zs = model._sample(zs,labels , sampling_kwargs, [0],model.config)
+        # TODO find the expected outputs 
+        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
+         653, 1434, 1434, 1536, 1599,  710])
+        assert torch.allclose(zs[0][0,:30],EXPECTED_OUTPUT)
+
+
 
 if __name__ == "__main__":
     tester = JukeboxModelTest()
@@ -1167,246 +1178,9 @@ def test_5b_lyrics(self):
 #         config_and_inputs = self.model_tester.prepare_config_and_inputs()
 #         self.model_tester.create_and_check_jukebox_weight_initialization(*config_and_inputs)
 
-#     @slow
-#     def test_batch_generation(self):
-#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
-#         model.to(torch_device)
-#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
-
-#         tokenizer.padding_side = "left"
-
-#         # Define PAD Token = EOS Token = 50256
-#         tokenizer.pad_token = tokenizer.eos_token
-#         model.config.pad_token_id = model.config.eos_token_id
-
-#         # use different length sentences to test batching
-#         sentences = [
-#             "Hello, my dog is a little",
-#             "Today, I",
-#         ]
-
-#         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
-#         input_ids = inputs["input_ids"].to(torch_device)
-#         token_type_ids = torch.cat(
-#             [
-#                 input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
-#                 input_ids.new_full((input_ids.shape[0], 1), 500),
-#             ],
-#             dim=-1,
-#         )
-
-#         outputs = model.generate(
-#             input_ids=input_ids,
-#             attention_mask=inputs["attention_mask"].to(torch_device),
-#         )
-
-#         outputs_tt = model.generate(
-#             input_ids=input_ids,
-#             attention_mask=inputs["attention_mask"].to(torch_device),
-#             token_type_ids=token_type_ids,
-#         )
-
-#         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-#         output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-#         num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
-#         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-#         output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-#         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-#         batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
-#         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-#         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-#         expected_output_sentence = [
-#             "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
-#             "Today, I'm going to be doing a lot of research on this. I",
-#         ]
-#         self.assertListEqual(expected_output_sentence, batch_out_sentence)
-#         self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
-#         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-
-#     @slow
-#     def test_batch_generation_2heads(self):
-#         model = JukeboxDoubleHeadsModel.from_pretrained("jukebox")
-#         model.to(torch_device)
-#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
-
-#         tokenizer.padding_side = "left"
-
-#         # This tokenizer has no pad token, so we have to set it in some way
-#         # Define PAD Token = EOS Token = 50256
-#         tokenizer.pad_token = tokenizer.eos_token
-#         model.config.pad_token_id = model.config.eos_token_id
-
-#         # use different length sentences to test batching
-#         sentences = [
-#             "Hello, my dog is a little",
-#             "Today, I",
-#         ]
-
-#         inputs = tokenizer(sentences, return_tensors="pt", padding=True)
-#         input_ids = inputs["input_ids"].to(torch_device)
-#         token_type_ids = torch.cat(
-#             [
-#                 input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
-#                 input_ids.new_full((input_ids.shape[0], 1), 500),
-#             ],
-#             dim=-1,
-#         )
-
-#         outputs = model.generate(
-#             input_ids=input_ids,
-#             attention_mask=inputs["attention_mask"].to(torch_device),
-#         )
-
-#         outputs_tt = model.generate(
-#             input_ids=input_ids,
-#             attention_mask=inputs["attention_mask"].to(torch_device),
-#             token_type_ids=token_type_ids,
-#         )
-
-#         inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
-#         output_non_padded = model.generate(input_ids=inputs_non_padded)
-
-#         num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
-#         inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
-#         output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)
-
-#         batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-#         batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
-#         non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
-#         padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)
-
-#         expected_output_sentence = [
-#             "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
-#             "Today, I'm going to be doing a lot of research on this. I",
-#         ]
-#         self.assertListEqual(expected_output_sentence, batch_out_sentence)
-#         self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
-#         self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
-
 #     @slow
 #     def test_model_from_pretrained(self):
 #         for model_name in JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
 #             model = JukeboxModel.from_pretrained(model_name)
 #             self.assertIsNotNone(model)
 
-
-# @require_torch
-# class JukeboxModelLanguageGenerationTest(unittest.TestCase):
-#     def _test_lm_generate_jukebox_helper(
-#         self,
-#         gradient_checkpointing=False,
-#         reorder_and_upcast_attn=False,
-#         scale_attn_by_inverse_layer_idx=False,
-#         verify_outputs=True,
-#     ):
-#         model = JukeboxLMHeadModel.from_pretrained(
-#             "jukebox",
-#             reorder_and_upcast_attn=reorder_and_upcast_attn,
-#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
-#         )
-#         if gradient_checkpointing:
-#             model.gradient_checkpointing_enable()
-#         else:
-#             model.gradient_checkpointing_disable()
-#         model.to(torch_device)
-
-#         # The dog
-#         input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device)
-
-#         # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
-#         # fmt: off
-#         expected_output_ids = [
-#             464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,
-#         ]
-#         # fmt: on
-#         output_ids = model.generate(input_ids, do_sample=False)
-#         if verify_outputs:
-#             self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
-
-#     @slow
-#     def test_lm_generate_jukebox(self):
-#         self._test_lm_generate_jukebox_helper()
-
-#     @slow
-#     def test_lm_generate_jukebox_with_gradient_checkpointing(self):
-#         self._test_lm_generate_jukebox_helper(gradient_checkpointing=True)
-
-#     @slow
-#     def test_lm_generate_jukebox_with_reorder_and_upcast_attn(self):
-#         self._test_lm_generate_jukebox_helper(reorder_and_upcast_attn=True)
-
-#     @slow
-#     def test_lm_generate_jukebox_with_scale_attn_by_inverse_layer_idx(self):
-#         self._test_lm_generate_jukebox_helper(scale_attn_by_inverse_layer_idx=True, verify_outputs=False)
-
-#     @slow
-#     def test_jukebox_sample(self):
-#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
-#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
-#         model.to(torch_device)
-
-#         torch.manual_seed(0)
-#         tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True)
-#         input_ids = tokenized.input_ids.to(torch_device)
-#         output_ids = model.generate(input_ids, do_sample=True)
-#         output_str = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-
-#         token_type_ids = tokenized.token_type_ids.to(torch_device)
-#         output_seq = model.generate(input_ids=input_ids, do_sample=True, num_return_sequences=5)
-#         output_seq_tt = model.generate(
-#             input_ids=input_ids, token_type_ids=token_type_ids, do_sample=True, num_return_sequences=5
-#         )
-#         output_seq_strs = tokenizer.batch_decode(output_seq, skip_special_tokens=True)
-#         output_seq_tt_strs = tokenizer.batch_decode(output_seq_tt, skip_special_tokens=True)
-
-#         EXPECTED_OUTPUT_STR = (
-#             "Today is a nice day and if you don't know anything about the state of play during your holiday"
-#         )
-#         self.assertEqual(output_str, EXPECTED_OUTPUT_STR)
-#         self.assertTrue(
-#             all([output_seq_strs[idx] != output_seq_tt_strs[idx] for idx in range(len(output_seq_tt_strs))])
-#         )  # token_type_ids should change output
-
-#     @slow
-#     def test_jukebox_sample_max_time(self):
-#         tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
-#         model = JukeboxLMHeadModel.from_pretrained("jukebox")
-#         model.to(torch_device)
-
-#         torch.manual_seed(0)
-#         tokenized = tokenizer("Today is a nice day and", return_tensors="pt", return_token_type_ids=True)
-#         input_ids = tokenized.input_ids.to(torch_device)
-
-#         MAX_TIME = 0.5
-
-#         start = datetime.datetime.now()
-#         model.generate(input_ids, do_sample=True, max_time=MAX_TIME, max_length=256)
-#         duration = datetime.datetime.now() - start
-#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
-#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
-
-#         start = datetime.datetime.now()
-#         model.generate(input_ids, do_sample=False, max_time=MAX_TIME, max_length=256)
-#         duration = datetime.datetime.now() - start
-#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
-#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
-
-#         start = datetime.datetime.now()
-#         model.generate(input_ids, do_sample=False, num_beams=2, max_time=MAX_TIME, max_length=256)
-#         duration = datetime.datetime.now() - start
-#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
-#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
-
-#         start = datetime.datetime.now()
-#         model.generate(input_ids, do_sample=True, num_beams=2, max_time=MAX_TIME, max_length=256)
-#         duration = datetime.datetime.now() - start
-#         self.assertGreater(duration, datetime.timedelta(seconds=MAX_TIME))
-#         self.assertLess(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))
-
-#         start = datetime.datetime.now()
-#         model.generate(input_ids, do_sample=False, max_time=None, max_length=256)
-#         duration = datetime.datetime.now() - start
-#         self.assertGreater(duration, datetime.timedelta(seconds=1.5 * MAX_TIME))

From 6c761c179f2aa011d02a92a7bccc106f6408606c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 12:10:01 +0000
Subject: [PATCH 016/196] style

---
 .../models/jukebox/modeling_jukebox.py        |  11 +-
 .../models/jukebox/tokenization_jukebox.py    |  72 ++--
 tests/models/jukebox/test_modeling_jukebox.py | 349 ++++++++++++++----
 .../jukebox/test_tokenization_jukebox.py      |   2 +-
 4 files changed, 316 insertions(+), 118 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2703755bcd0c7..71c235ad2c36a 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -42,6 +42,7 @@
 from .configuration_jukebox import JukeboxConfig
 from .tokenization_jukebox import get_relevant_lyric_tokens
 
+
 logger = logging.get_logger(__name__)
 
 # _CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
@@ -705,9 +706,9 @@ def __init__(self, config):
         if not config.sample_length:
             downsamples = calculate_strides(config.vq_vae_strides_t, config.vq_vae_downs_t)
             top_raw_to_tokens = np.prod(downsamples)
-            config.sample_length = ((
-                config.sample_length_in_seconds * config.sr // top_raw_to_tokens
-            ) * top_raw_to_tokens).astype(int)
+            config.sample_length = (
+                (config.sample_length_in_seconds * config.sr // top_raw_to_tokens) * top_raw_to_tokens
+            ).astype(int)
 
         input_shape = (config.sample_length, 1)
         block_kwargs = dict(
@@ -2780,12 +2781,12 @@ def set_y_lyric_tokens(self, ys, labels):
             tokens_list = []
             indices_list = []  # whats the index of each current character in original array
             for i in range(ys.shape[0]):
-                full_tokens = labels['full_tokens']
+                full_tokens = labels["full_tokens"]
                 total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
                 tokens_list.append(tokens)
                 indices_list.append(indices)
-            ys[:, -self.n_tokens:] = torch.tensor(tokens_list, dtype=torch.long, device='cpu')
+            ys[:, -self.n_tokens :] = torch.tensor(tokens_list, dtype=torch.long, device="cpu")
             return indices_list
         else:
             return None
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 94eebd3c2f3ae..d924874bf45bd 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -20,13 +20,14 @@
 from json.encoder import INFINITY
 from typing import Any, Dict, List, Optional, Tuple
 
+import torch
+
 import regex as re
 from tokenizers import normalizers
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
-import torch
 
 logger = logging.get_logger(__name__)
 
@@ -47,23 +48,22 @@
 
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
     """
-    Extract only the relevant tokens based on the character position. A total of
-    `max_n_lyric_tokens` tokens will be returned. If the provided token sequence is smaller, it will be padded,
-    othewise, only characters ranging from the midpoint - `max_n_lyric_tokens//2` to the midpoint +
-    `max_n_lyric_tokens//2` will be returned. This *focuses* on the most relevant tokens (in time) for the
-    sequence.
+    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
+    returned. If the provided token sequence is smaller, it will be padded, othewise, only characters ranging from the
+    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
+    the most relevant tokens (in time) for the sequence.
 
     Args: # TODO : args to prettify
         full_tokens (`List[int]`):
-            List containing the ids of the entire lyrics. 
+            List containing the ids of the entire lyrics.
         total_length (`int`):
-            Total expected length of the music (not all of it is generated, see duration), in samples. 
+            Total expected length of the music (not all of it is generated, see duration), in samples.
         offset (`int`):
-            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted 
-            take that into account
+            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
+            account
         duration (`int`):
-            Expected duration of the generated music, in seconds. The duration has to be smaller than the 
-            total lenght, which represent the overall length of the signal, 
+            Expected duration of the generated music, in seconds. The duration has to be smaller than the total lenght,
+            which represent the overall length of the signal,
     """
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = [0] * (max_n_lyric_tokens - len(full_tokens)) + full_tokens
@@ -71,18 +71,15 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     else:
         assert 0 <= offset < total_length
         midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
-        midpoint = min(
-            max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2
-        )
+        midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
         tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
         indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
     assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
-    assert (
-        len(indices) == max_n_lyric_tokens
-    ), f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
+    assert len(indices) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
     assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
     return tokens, indices
 
+
 class JukeboxTokenizer(PreTrainedTokenizer):
     """
     Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
@@ -139,7 +136,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     max_lyric_input_size = PRETRAINED_LYRIC_TOKENS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens = 512, unk_token="<|endoftext|>", **kwargs):
+    def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens=512, unk_token="<|endoftext|>", **kwargs):
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
@@ -152,12 +149,12 @@ def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens = 512, unk_token="
             self.artists_encoder = vocabulary["artists"]
             self.genres_encoder = vocabulary["genres"]
             self.lyrics_encoder = vocabulary["lyrics"]
-        
+
         oov = "[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]"
         # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
-        if len(self.lyrics_encoder)== 79:
-            oov += "+" 
-            
+        if len(self.lyrics_encoder) == 79:
+            oov += "+"
+
         self.out_of_vocab = re.compile(oov)  # FIXME: should be an argument?
 
         self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
@@ -194,7 +191,7 @@ def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, dur
         genres_ids = [self.genres_encoder.get(genre) for genre in genres]
         genres_ids = genres_ids + [-1] * (self.n_genres - len(genres_ids))
         lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
-        y,_ = get_relevant_lyric_tokens(lyric_ids, self.max_n_lyric_tokens,total_length, offset, duration)
+        y, _ = get_relevant_lyric_tokens(lyric_ids, self.max_n_lyric_tokens, total_length, offset, duration)
         return artists_id, genres_ids, y, lyric_ids
 
     def _tokenize(self, lyrics):
@@ -267,14 +264,16 @@ def _normalize(self, text: str) -> str:
         """
         import re
 
-        accepted = [chr(i) for i in range(ord("a"), ord("z") + 1)] \
-            + [chr(i) for i in range(ord("A"), ord("Z") + 1)] \
+        accepted = (
+            [chr(i) for i in range(ord("a"), ord("z") + 1)]
+            + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
             + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+        )
 
-        # In v2, " " is not accepted while it is for v3 
-        if len(self.lyrics_encoder)== 79:
+        # In v2, " " is not accepted while it is for v3
+        if len(self.lyrics_encoder) == 79:
             accepted += [" "]
-        accepted = frozenset(accepted) 
+        accepted = frozenset(accepted)
         rex = re.compile(r"_+")
         text = "".join([c if c in accepted else "_" for c in text.lower()])
         text = rex.sub("_", text).strip("_")
@@ -284,11 +283,11 @@ def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
         """Converts an index (integer) in a token (str) using the vocab.
         Args:
             artists_index (`int`):
-                Index of the artist in its corresponding dictionnary. 
+                Index of the artist in its corresponding dictionnary.
             genres_index (`Union[List[int], int]`):
-               Index of the genre in its corresponding dictionnary. 
+               Index of the genre in its corresponding dictionnary.
             lyric_index (`List[int]`):
-                List of character indices, which each correspond to a character. 
+                List of character indices, which each correspond to a character.
         """
         artist = self.artists_decoder.get(artists_index)
         genres = [self.genres_decoder.get(genre) for genre in genres_index]
@@ -301,7 +300,7 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
 
-    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset, return_tensor = "pt"):
+    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset, return_tensor="pt"):
         """
         Convert the raw string to token ids
 
@@ -318,7 +317,7 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset,
                 _description_
             offset (`int`):
                 _description_
-            max_n_lyric_tokens (`int`): 
+            max_n_lyric_tokens (`int`):
                 _description_
         """
         input_ids = [total_length, offset, sample_length]
@@ -330,7 +329,10 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset,
         attention_masks = [-INFINITY] * (len(full_tokens) - len(relevant_tokens)) + [0] * len(relevant_tokens)
         # TODO properly handle the return pt tensor option
         if return_tensor == "pt":
-            return {"input_ids": {'y': torch.tensor([input_ids]), 'full_tokens':full_tokens}, "attention_masks":torch.tensor( attention_masks)}
+            return {
+                "input_ids": {"y": torch.tensor([input_ids]), "full_tokens": full_tokens},
+                "attention_masks": torch.tensor(attention_masks),
+            }
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index ccf0cd507fd5b..6d531835f2ad0 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -39,11 +39,11 @@ class JukeboxModelTest(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
 
     metas = dict(
-            artist="Zac Brown Band",
-            genres="Country",
-            total_length=440960,
-            offset=0,
-            lyrics="""I met a traveller from an antique land,
+        artist="Zac Brown Band",
+        genres="Country",
+        total_length=440960,
+        offset=0,
+        lyrics="""I met a traveller from an antique land,
     Who said—“Two vast and trunkless legs of stone
     Stand in the desert. . . . Near them, on the sand,
     Half sunk a shattered visage lies, whose frown,
@@ -58,9 +58,9 @@ class JukeboxModelTest(unittest.TestCase):
     Of that colossal Wreck, boundless and bare
     The lone and level sands stretch far away
     """,
-            duration=2,
-            sample_length=786432,
-        )
+        duration=2,
+        sample_length=786432,
+    )
     # @slow
     def test_model(self):
         set_seed(0)
@@ -571,7 +571,7 @@ def test_model(self):
     def test_conditioning(self):
         pass
         # x,x_conds and y_conds should be the same  before calling the sampling
-        # start and end embeding 
+        # start and end embeding
         # expected conditioning to match
 
     def test_1b_lyrics(self):
@@ -581,7 +581,6 @@ def test_1b_lyrics(self):
 
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
 
-
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
@@ -593,53 +592,141 @@ def test_1b_lyrics(self):
             dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
         ]
 
-
-        self.metas.sample_length=model.priors[-1].sample_length
+        self.metas.sample_length = model.priors[-1].sample_length
 
         tokens = tokenizer(**self.metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        
-        zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
-        labels = [{},{}, inputs]
 
+        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
+        labels = [{}, {}, inputs]
 
         set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
-        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
-
-        EXPECTED_OUTPUT = torch.tensor([1489, 1489,  324, 1489, 1599, 1072, 1357, 1489,  784, 1272])
-        
-        # TODO generate the original outputs 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[-1][0,:30],EXPECTED_OUTPUT)
-
-
-        labels[1]['y']= inputs['y'][:,:9]
-        labels[0]['y']= inputs['y'][:,:9]
-
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
-        zs = model._sample(zs,labels , sampling_kwargs, [1],model.config)
-        # TODO find the expected outputs 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[-2][0,:30],EXPECTED_OUTPUT)
-
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
-        zs = model._sample(zs,labels , sampling_kwargs, [0],model.config)
-        # TODO find the expected outputs 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[0][0,:30],EXPECTED_OUTPUT)
-
-
-        
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+
+        EXPECTED_OUTPUT = torch.tensor([1489, 1489, 324, 1489, 1599, 1072, 1357, 1489, 784, 1272])
+
+        # TODO generate the original outputs
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
+
+        labels[1]["y"] = inputs["y"][:, :9]
+        labels[0]["y"] = inputs["y"][:, :9]
+
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        # TODO find the expected outputs
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
+
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        # TODO find the expected outputs
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[0][0, :30], EXPECTED_OUTPUT)
+
     def test_5b_lyrics(self):
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval() 
+        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval()
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-5b-lyrics")
         set_seed(0)
 
@@ -649,48 +736,157 @@ def test_5b_lyrics(self):
         lower_level_chunk_size = 32
         chunk_size = 32
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=30),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=lower_level_chunk_size,
+                sample_tokens=30,
+            ),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=lower_level_chunk_size,
+                sample_tokens=30,
+            ),
+            dict(
+                temp=sampling_temperature,
+                fp16=False,
+                max_batch_size=max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=30,
+            ),
         ]
 
         tokens = tokenizer(**self.metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        
-        zs = [torch.zeros(1,0,dtype = torch.long).cpu() for _ in range(len(model.priors))] 
+
+        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
         labels = [{}, {}, inputs]
         # model = model.cuda()
 
         set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
-        zs = model._sample(zs,labels , sampling_kwargs, [2],model.config)
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[-1][0,:30],EXPECTED_OUTPUT)
-
-
-        labels[1]['y']= inputs['y'][:,:9]
-        labels[0]['y']= inputs['y'][:,:9]
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
 
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
-        zs = model._sample(zs,labels , sampling_kwargs, [1],model.config)
-        # TODO find the expected outputs 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[-2][0,:30],EXPECTED_OUTPUT)
+        labels[1]["y"] = inputs["y"][:, :9]
+        labels[0]["y"] = inputs["y"][:, :9]
 
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1,2048-zs[-1].shape[-1]).cpu()),dim=-1)
-        zs = model._sample(zs,labels , sampling_kwargs, [0],model.config)
-        # TODO find the expected outputs 
-        EXPECTED_OUTPUT = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1434, 1434,  653, 1357,
-         653, 1434, 1434, 1536, 1599,  710])
-        assert torch.allclose(zs[0][0,:30],EXPECTED_OUTPUT)
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        # TODO find the expected outputs
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
 
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        # TODO find the expected outputs
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1489,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1434,
+                1434,
+                653,
+                1357,
+                653,
+                1434,
+                1434,
+                1536,
+                1599,
+                710,
+            ]
+        )
+        assert torch.allclose(zs[0][0, :30], EXPECTED_OUTPUT)
 
 
 if __name__ == "__main__":
@@ -1183,4 +1379,3 @@ def test_5b_lyrics(self):
 #         for model_name in JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
 #             model = JukeboxModel.from_pretrained(model_name)
 #             self.assertIsNotNone(model)
-
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index 6b50b10251b80..83ea3509c95cd 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -40,7 +40,7 @@ def test_tokenizer(self):
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
         tokenizer.max_n_lyric_tokens = 20
         tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0)
-        inputs, attention_masks = tokens["input_ids"]['y'], tokens["attention_masks"]
+        inputs, attention_masks = tokens["input_ids"]["y"], tokens["attention_masks"]
         EXPECTED_OUTPUT = [
             10584000,
             0,

From adc849d5cc4dfec8f563b46a04cf56fe55155c77 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 13:04:26 +0000
Subject: [PATCH 017/196] clean test

---
 tests/models/jukebox/test_modeling_jukebox.py | 99 ++++++-------------
 1 file changed, 31 insertions(+), 68 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 6d531835f2ad0..b45df74510f8d 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -58,8 +58,6 @@ class JukeboxModelTest(unittest.TestCase):
     Of that colossal Wreck, boundless and bare
     The lone and level sands stretch far away
     """,
-        duration=2,
-        sample_length=786432,
     )
     # @slow
     def test_model(self):
@@ -573,39 +571,43 @@ def test_conditioning(self):
         # x,x_conds and y_conds should be the same  before calling the sampling
         # start and end embeding
         # expected conditioning to match
-
-    def test_1b_lyrics(self):
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-1b-lyrics").eval()  # .to("cuda")
-
-        # model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 386 * [0]]).long().to("cuda"), chunk_size=32)
-
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox", max_n_lyric_tokens=384)
-
+    def prepare_inputs(self,model_id, sample_length, chunk_size =32):
+        tokenizer = JukeboxTokenizer.from_pretrained(model_id, max_n_lyric_tokens=384)
+         # create sampling parameters
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
         lower_level_chunk_size = 32
-        chunk_size = 32
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=30),
         ]
 
-        self.metas.sample_length = model.priors[-1].sample_length
 
-        tokens = tokenizer(**self.metas)
+        tokens = tokenizer(**self.metas, sample_length = sample_length )
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
 
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
-        labels = [{}, {}, inputs]
+       
+        labels = [inputs*3]
+        labels[1]["y"] = inputs["y"][:, :9]
+        labels[0]["y"] = inputs["y"][:, :9]
+
+        return labels, sampling_kwargs
 
+    # @slow 
+    def test_1b_lyrics(self):
         set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
-        EXPECTED_OUTPUT = torch.tensor([1489, 1489, 324, 1489, 1599, 1072, 1357, 1489, 784, 1272])
+        model_id = "ArthurZ/jukebox-1b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id).eval() 
+        
+        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length)
+
+        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
         # TODO generate the original outputs
         EXPECTED_OUTPUT = torch.tensor(
@@ -644,9 +646,6 @@ def test_1b_lyrics(self):
         )
         assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
 
-        labels[1]["y"] = inputs["y"][:, :9]
-        labels[0]["y"] = inputs["y"][:, :9]
-
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         # TODO find the expected outputs
@@ -686,7 +685,7 @@ def test_1b_lyrics(self):
         )
         assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
 
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 4096 - zs[-2].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         # TODO find the expected outputs
         EXPECTED_OUTPUT = torch.tensor(
@@ -725,52 +724,18 @@ def test_1b_lyrics(self):
         )
         assert torch.allclose(zs[0][0, :30], EXPECTED_OUTPUT)
 
-    def test_5b_lyrics(self):
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-5b-lyrics").eval()
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-5b-lyrics")
+    def test_5b_lyrics(self):        
         set_seed(0)
+        torch.backends.cuda.matmul.allow_tf32 = False
+        torch.backends.cudnn.enabled = False
 
-        sampling_temperature = 0.98
-        lower_batch_size = 16
-        max_batch_size = 16
-        lower_level_chunk_size = 32
-        chunk_size = 32
-        sampling_kwargs = [
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=lower_level_chunk_size,
-                sample_tokens=30,
-            ),
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=lower_level_chunk_size,
-                sample_tokens=30,
-            ),
-            dict(
-                temp=sampling_temperature,
-                fp16=False,
-                max_batch_size=max_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=30,
-            ),
-        ]
-
-        tokens = tokenizer(**self.metas)
-        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+        model_id = "ArthurZ/jukebox-5b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id).eval() 
+        
+        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length, chunk_size = 16)
 
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
-        labels = [{}, {}, inputs]
-        # model = model.cuda()
-
-        set_seed(0)
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.enabled = False
         zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
-
         EXPECTED_OUTPUT = torch.tensor(
             [
                 1489,
@@ -807,8 +772,6 @@ def test_5b_lyrics(self):
         )
         assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
 
-        labels[1]["y"] = inputs["y"][:, :9]
-        labels[0]["y"] = inputs["y"][:, :9]
 
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
@@ -849,7 +812,7 @@ def test_5b_lyrics(self):
         )
         assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
 
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 4096 - zs[-2].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         # TODO find the expected outputs
         EXPECTED_OUTPUT = torch.tensor(

From 1e5a94e8e4fac948856974f4313d286217910ed2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 15:19:48 +0000
Subject: [PATCH 018/196] fix tokenizer

---
 .../models/jukebox/tokenization_jukebox.py    |  11 +-
 tests/models/jukebox/test_modeling_jukebox.py | 514 +-----------------
 2 files changed, 22 insertions(+), 503 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index d924874bf45bd..2ff5a5f37e081 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -62,7 +62,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
             account
         duration (`int`):
-            Expected duration of the generated music, in seconds. The duration has to be smaller than the total lenght,
+            Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
             which represent the overall length of the signal,
     """
     if len(full_tokens) < max_n_lyric_tokens:
@@ -140,6 +140,8 @@ def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens=512, unk_token="<|
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
+            n_genres=n_genres, 
+            max_n_lyric_tokens=max_n_lyric_tokens,
             **kwargs,
         )
         self.n_genres = n_genres
@@ -150,13 +152,12 @@ def __init__(self, vocab_file, n_genres=5, max_n_lyric_tokens=512, unk_token="<|
             self.genres_encoder = vocabulary["genres"]
             self.lyrics_encoder = vocabulary["lyrics"]
 
-        oov = "[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]"
+        oov = '[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+'
         # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
         if len(self.lyrics_encoder) == 79:
-            oov += "+"
-
-        self.out_of_vocab = re.compile(oov)  # FIXME: should be an argument?
+            oov = oov.replace("\-\'","\-+\'")
 
+        self.out_of_vocab = re.compile(oov) 
         self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
         self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
         self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index b45df74510f8d..aab8bb430ec83 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -41,7 +41,6 @@ class JukeboxModelTest(unittest.TestCase):
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
-        total_length=440960,
         offset=0,
         lyrics="""I met a traveller from an antique land,
     Who said—“Two vast and trunkless legs of stone
@@ -571,27 +570,30 @@ def test_conditioning(self):
         # x,x_conds and y_conds should be the same  before calling the sampling
         # start and end embeding
         # expected conditioning to match
-    def prepare_inputs(self,model_id, sample_length, chunk_size =32):
-        tokenizer = JukeboxTokenizer.from_pretrained(model_id, max_n_lyric_tokens=384)
+
+    def prepare_inputs(self,model, model_id, chunk_size =32):
+        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
          # create sampling parameters
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
         lower_level_chunk_size = 32
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=30),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=30),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=10),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=10),
+            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=10),
         ]
 
-
-        tokens = tokenizer(**self.metas, sample_length = sample_length )
+        sample_length_in_seconds = 24     
+        top_prior = model.priors[-1]
+        total_length = (int(sample_length_in_seconds*model.config.sr)//top_prior.raw_to_tokens)*top_prior.raw_to_tokens
+        tokens = tokenizer(**self.metas, sample_length =  top_prior.sample_length, total_length = total_length)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
 
        
-        labels = [inputs*3]
-        labels[1]["y"] = inputs["y"][:, :9]
-        labels[0]["y"] = inputs["y"][:, :9]
+        labels = [inputs.copy() for i in range(3)]
+        labels[1]["y"] = labels[1]["y"][:, :9]
+        labels[0]["y"] = labels[0]["y"][:, :9]
 
         return labels, sampling_kwargs
 
@@ -604,7 +606,7 @@ def test_1b_lyrics(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval() 
         
-        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length)
+        labels, sampling_kwargs = self.prepare_inputs(model,model_id)
 
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
@@ -854,491 +856,7 @@ def test_5b_lyrics(self):
 
 if __name__ == "__main__":
     tester = JukeboxModelTest()
-    tester.test_gpu_sampling()
-
-# class JukeboxModelTester:
-#     def __init__(
-#         self,
-#         parent,
-#         batch_size=14,
-#         seq_length=7,
-#         is_training=True,
-#         use_token_type_ids=True,
-#         use_input_mask=True,
-#         use_labels=True,
-#         use_mc_token_ids=True,
-#         vocab_size=99,
-#         hidden_size=32,
-#         num_hidden_layers=5,
-#         num_attention_heads=4,
-#         intermediate_size=37,
-#         hidden_act="gelu",
-#         hidden_dropout_prob=0.1,
-#         attention_probs_dropout_prob=0.1,
-#         max_position_embeddings=512,
-#         type_vocab_size=16,
-#         type_sequence_label_size=2,
-#         initializer_range=0.02,
-#         num_labels=3,
-#         num_choices=4,
-#         scope=None,
-#     ):
-#         self.parent = parent
-#         self.batch_size = batch_size
-#         self.seq_length = seq_length
-#         self.is_training = is_training
-#         self.use_token_type_ids = use_token_type_ids
-#         self.use_input_mask = use_input_mask
-#         self.use_labels = use_labels
-#         self.use_mc_token_ids = use_mc_token_ids
-#         self.vocab_size = vocab_size
-#         self.hidden_size = hidden_size
-#         self.num_hidden_layers = num_hidden_layers
-#         self.num_attention_heads = num_attention_heads
-#         self.intermediate_size = intermediate_size
-#         self.hidden_act = hidden_act
-#         self.hidden_dropout_prob = hidden_dropout_prob
-#         self.attention_probs_dropout_prob = attention_probs_dropout_prob
-#         self.max_position_embeddings = max_position_embeddings
-#         self.type_vocab_size = type_vocab_size
-#         self.type_sequence_label_size = type_sequence_label_size
-#         self.initializer_range = initializer_range
-#         self.num_labels = num_labels
-#         self.num_choices = num_choices
-#         self.scope = None
-#         self.bos_token_id = vocab_size - 1
-#         self.eos_token_id = vocab_size - 1
-#         self.pad_token_id = vocab_size - 1
-
-#     def get_large_model_config(self):
-#         return JukeboxConfig.from_pretrained("jukebox")
-
-#     def prepare_config_and_inputs(
-#         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
-#     ):
-#         input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-#         input_mask = None
-#         if self.use_input_mask:
-#             input_mask = random_attention_mask([self.batch_size, self.seq_length])
-
-#         token_type_ids = None
-#         if self.use_token_type_ids:
-#             token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-#         mc_token_ids = None
-#         if self.use_mc_token_ids:
-#             mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
-
-#         sequence_labels = None
-#         token_labels = None
-#         choice_labels = None
-#         if self.use_labels:
-#             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-#             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-#             choice_labels = ids_tensor([self.batch_size], self.num_choices)
-
-#         config = self.get_config(
-#             gradient_checkpointing=gradient_checkpointing,
-#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
-#             reorder_and_upcast_attn=reorder_and_upcast_attn,
-#         )
-
-#         head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-
-#         return (
-#             config,
-#             input_ids,
-#             input_mask,
-#             head_mask,
-#             token_type_ids,
-#             mc_token_ids,
-#             sequence_labels,
-#             token_labels,
-#             choice_labels,
-#         )
-
-#     def get_config(
-#         self, gradient_checkpointing=False, scale_attn_by_inverse_layer_idx=False, reorder_and_upcast_attn=False
-#     ):
-#         return JukeboxConfig(
-#             vocab_size=self.vocab_size,
-#             n_embd=self.hidden_size,
-#             n_layer=self.num_hidden_layers,
-#             n_head=self.num_attention_heads,
-#             n_inner=self.intermediate_size,
-#             activation_function=self.hidden_act,
-#             resid_pdrop=self.hidden_dropout_prob,
-#             attn_pdrop=self.attention_probs_dropout_prob,
-#             n_positions=self.max_position_embeddings,
-#             type_vocab_size=self.type_vocab_size,
-#             initializer_range=self.initializer_range,
-#             use_cache=True,
-#             bos_token_id=self.bos_token_id,
-#             eos_token_id=self.eos_token_id,
-#             pad_token_id=self.pad_token_id,
-#             gradient_checkpointing=gradient_checkpointing,
-#             scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx,
-#             reorder_and_upcast_attn=reorder_and_upcast_attn,
-#         )
-
-#     def prepare_config_and_inputs_for_decoder(self):
-#         (
-#             config,
-#             input_ids,
-#             input_mask,
-#             head_mask,
-#             token_type_ids,
-#             mc_token_ids,
-#             sequence_labels,
-#             token_labels,
-#             choice_labels,
-#         ) = self.prepare_config_and_inputs()
-
-#         encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
-#         encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
-
-#         return (
-#             config,
-#             input_ids,
-#             input_mask,
-#             head_mask,
-#             token_type_ids,
-#             sequence_labels,
-#             token_labels,
-#             choice_labels,
-#             encoder_hidden_states,
-#             encoder_attention_mask,
-#         )
-
-#     def create_and_check_jukebox_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-#         model = JukeboxModel(config=config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
-#         result = model(input_ids, token_type_ids=token_type_ids)
-#         result = model(input_ids)
-
-#         self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
-#         self.parent.assertEqual(len(result.past_key_values), config.n_layer)
-
-#     def create_and_check_jukebox_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-#         model = JukeboxModel(config=config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         # first forward pass
-#         outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
-#         outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
-#         outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)
-
-#         self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
-#         self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
-
-#         output, past = outputs.to_tuple()
-
-#         # create hypothetical next token and extent to next_input_ids
-#         next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-#         next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)
-
-#         # append to next input_ids and token_type_ids
-#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-#         next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
-
-#         output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
-#         output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
-#             "last_hidden_state"
-#         ]
-
-#         # select random slice
-#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-#         output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-#         output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-#         # test that outputs are equal for slice
-#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-#     def create_and_check_jukebox_model_attention_mask_past(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-#     ):
-#         model = JukeboxModel(config=config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         # create attention mask
-#         attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
-#         half_seq_length = self.seq_length // 2
-#         attn_mask[:, half_seq_length:] = 0
-
-#         # first forward pass
-#         output, past = model(input_ids, attention_mask=attn_mask).to_tuple()
-
-#         # create hypothetical next token and extent to next_input_ids
-#         next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
-
-#         # change a random masked slice from input_ids
-#         random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
-#         random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
-#         input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens
-
-#         # append to next input_ids and attn_mask
-#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-#         attn_mask = torch.cat(
-#             [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
-#             dim=1,
-#         )
-
-#         # get two different outputs
-#         output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
-#         output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
-
-#         # select random slice
-#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-#         output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
-#         output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
-
-#         # test that outputs are equal for slice
-#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-#     def create_and_check_jukebox_model_past_large_inputs(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args
-#     ):
-#         model = JukeboxModel(config=config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         # first forward pass
-#         outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, use_cache=True)
-
-#         output, past = outputs.to_tuple()
-
-#         # create hypothetical next token and extent to next_input_ids
-#         next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
-#         next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)
-#         next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
-
-#         # append to next input_ids and token_type_ids
-#         next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
-#         next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
-#         next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
-
-#         output_from_no_past = model(
-#             next_input_ids, token_type_ids=next_token_type_ids, attention_mask=next_attention_mask
-#         )["last_hidden_state"]
-#         output_from_past = model(
-#             next_tokens, token_type_ids=next_token_types, attention_mask=next_attention_mask, past_key_values=past
-#         )["last_hidden_state"]
-#         self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
-
-#         # select random slice
-#         random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
-#         output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
-#         output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
-
-#         # test that outputs are equal for slice
-#         self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
-
-#     def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
-#         model = JukeboxLMHeadModel(config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-#         self.parent.assertEqual(result.loss.shape, ())
-#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-
-#     def create_and_check_forward_and_backwards(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, *args, gradient_checkpointing=False
-#     ):
-#         model = JukeboxLMHeadModel(config)
-#         model.to(torch_device)
-#         if gradient_checkpointing:
-#             model.gradient_checkpointing_enable()
-
-#         result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
-#         self.parent.assertEqual(result.loss.shape, ())
-#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
-#         result.loss.backward()
-
-#     def create_and_check_double_lm_head_model(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
-#     ):
-#         model = JukeboxDoubleHeadsModel(config)
-#         model.to(torch_device)
-#         model.eval()
-
-#         multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-#         multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-#         multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-
-#         inputs = {
-#             "input_ids": multiple_choice_inputs_ids,
-#             "mc_token_ids": mc_token_ids,
-#             "attention_mask": multiple_choice_input_mask,
-#             "token_type_ids": multiple_choice_token_type_ids,
-#             "labels": multiple_choice_inputs_ids,
-#         }
-
-#         result = model(**inputs)
-#         self.parent.assertEqual(result.loss.shape, ())
-#         self.parent.assertEqual(
-#             result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
-#         )
-#         self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
-
-#     def create_and_check_jukebox_for_sequence_classification(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args
-#     ):
-#         config.num_labels = self.num_labels
-#         model = JukeboxForSequenceClassification(config)
-#         model.to(torch_device)
-#         model.eval()
-#         result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
-#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-#     def create_and_check_jukebox_for_token_classification(
-#         self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, *args
-#     ):
-#         config.num_labels = self.num_labels
-#         model = JukeboxForTokenClassification(config)
-#         model.to(torch_device)
-#         model.eval()
-#         result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
-#         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels))
-
-#     def create_and_check_jukebox_weight_initialization(self, config, *args):
-#         model = JukeboxModel(config)
-#         model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
-#         for key in model.state_dict().keys():
-#             if "c_proj" in key and "weight" in key:
-#                 self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
-#                 self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
-
-#     def prepare_config_and_inputs_for_common(self):
-#         config_and_inputs = self.prepare_config_and_inputs()
-
-#         (
-#             config,
-#             input_ids,
-#             input_mask,
-#             head_mask,
-#             token_type_ids,
-#             mc_token_ids,
-#             sequence_labels,
-#             token_labels,
-#             choice_labels,
-#         ) = config_and_inputs
-
-#         inputs_dict = {
-#             "input_ids": input_ids,
-#             "token_type_ids": token_type_ids,
-#             "head_mask": head_mask,
-#         }
-
-#         return config, inputs_dict
-
-
-# @require_torch
-# class JukeboxModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
-#     all_model_classes = (
-#         (
-#             JukeboxModel,
-#             JukeboxLMHeadModel,
-#             JukeboxDoubleHeadsModel,
-#             JukeboxForSequenceClassification,
-#             JukeboxForTokenClassification,
-#         )
-#         if is_torch_available()
-#         else ()
-#     )
-#     all_generative_model_classes = (JukeboxLMHeadModel, JukeboxDoubleHeadsModel) if is_torch_available() else ()
-#     all_parallelizable_model_classes = (JukeboxLMHeadModel, JukeboxDoubleHeadsModel) if is_torch_available() else ()
-#     fx_compatible = False
-#     test_missing_keys = False
-#     test_model_parallel = True
-
-#     # special case for DoubleHeads model
-#     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
-#         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
-
-#         if return_labels:
-#             if model_class.__name__ == "JukeboxDoubleHeadsModel":
-#                 inputs_dict["labels"] = torch.zeros(
-#                     (self.model_tester.batch_size, self.model_tester.num_choices, self.model_tester.seq_length),
-#                     dtype=torch.long,
-#                     device=torch_device,
-#                 )
-#                 inputs_dict["input_ids"] = inputs_dict["labels"]
-#                 inputs_dict["token_type_ids"] = inputs_dict["labels"]
-#                 inputs_dict["mc_token_ids"] = torch.zeros(
-#                     (self.model_tester.batch_size, self.model_tester.num_choices),
-#                     dtype=torch.long,
-#                     device=torch_device,
-#                 )
-#                 inputs_dict["mc_labels"] = torch.zeros(
-#                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
-#                 )
-#         return inputs_dict
-
-#     def setUp(self):
-#         self.model_tester = JukeboxModelTester(self)
-#         self.config_tester = ConfigTester(self, config_class=JukeboxConfig, n_embd=37)
-
-#     def test_config(self):
-#         self.config_tester.run_common_tests()
-
-#     def test_jukebox_model(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_model(*config_and_inputs)
-
-#     def test_jukebox_model_past(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_model_past(*config_and_inputs)
-
-#     def test_jukebox_model_att_mask_past(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_model_attention_mask_past(*config_and_inputs)
-
-#     def test_jukebox_model_past_large_inputs(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_model_past_large_inputs(*config_and_inputs)
-
-#     def test_jukebox_lm_head_model(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
-
-#     def test_jukebox_double_lm_head_model(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
-
-#     def test_jukebox_sequence_classification_model(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_for_sequence_classification(*config_and_inputs)
-
-#     def test_jukebox_token_classification_model(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_for_token_classification(*config_and_inputs)
-
-#     def test_jukebox_gradient_checkpointing(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs, gradient_checkpointing=True)
-
-#     def test_jukebox_scale_attn_by_inverse_layer_idx(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs(scale_attn_by_inverse_layer_idx=True)
-#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs)
-
-#     def test_jukebox_reorder_and_upcast_attn(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs(reorder_and_upcast_attn=True)
-#         self.model_tester.create_and_check_forward_and_backwards(*config_and_inputs)
+    tester.test_1b_lyrics()
+    tester.test_5b_lyrics()
 
-#     def test_jukebox_weight_initialization(self):
-#         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-#         self.model_tester.create_and_check_jukebox_weight_initialization(*config_and_inputs)
 
-#     @slow
-#     def test_model_from_pretrained(self):
-#         for model_name in JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-#             model = JukeboxModel.from_pretrained(model_name)
-#             self.assertIsNotNone(model)

From 0450a376c1010769de9d76d2f1405f0c684729df Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Mon, 4 Jul 2022 16:43:06 +0000
Subject: [PATCH 019/196] 1b expected outputs and major update

---
 tests/models/jukebox/test_modeling_jukebox.py | 142 ++++--------------
 1 file changed, 26 insertions(+), 116 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index aab8bb430ec83..e23a7a78eab33 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -42,8 +42,9 @@ class JukeboxModelTest(unittest.TestCase):
         artist="Zac Brown Band",
         genres="Country",
         offset=0,
-        lyrics="""I met a traveller from an antique land,
-    Who said—“Two vast and trunkless legs of stone
+        lyrics=
+    """I met a traveller from an antique land,
+    Who said "Two vast and trunkless legs of stone
     Stand in the desert. . . . Near them, on the sand,
     Half sunk a shattered visage lies, whose frown,
     And wrinkled lip, and sneer of cold command,
@@ -571,16 +572,15 @@ def test_conditioning(self):
         # start and end embeding
         # expected conditioning to match
 
-    def prepare_inputs(self,model, model_id, chunk_size =32):
+    def prepare_inputs(self,model, model_id, chunk_size = 32):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
          # create sampling parameters
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
-        lower_level_chunk_size = 32
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=10),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size, sample_tokens=10),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
+            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
             dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=10),
         ]
 
@@ -592,14 +592,13 @@ def prepare_inputs(self,model, model_id, chunk_size =32):
 
        
         labels = [inputs.copy() for i in range(3)]
-        labels[1]["y"] = labels[1]["y"][:, :9]
-        labels[0]["y"] = labels[0]["y"][:, :9]
+        labels[1]["y"] = labels[1]["y"][:, :(4+tokenizer.n_genres)]
+        labels[0]["y"] = labels[0]["y"][:, :(4+tokenizer.n_genres)]
 
         return labels, sampling_kwargs
 
     # @slow 
     def test_1b_lyrics(self):
-        set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
 
@@ -608,123 +607,34 @@ def test_1b_lyrics(self):
         
         labels, sampling_kwargs = self.prepare_inputs(model,model_id)
 
+        set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
         # TODO generate the original outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
+        EXPECTED_OUTPUT = torch.tensor([1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
+          653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
+          555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
+         1106, 1127, 1494,  812])
+        assert torch.allclose(zs[-1][0], EXPECTED_OUTPUT)
 
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
+        EXPECTED_OUTPUT = torch.tensor([904, 2037,  343, 1372,  135,  717,  506,  157,  307, 1419, 1751,  343,
+          899, 1803,  573,   94, 1046, 1014,  684,  869, 2037, 1125, 1004, 1658,
+         1181,   37, 1749, 2047, 1426, 1348, 2037, 1125, 1004, 1544,  573,  885,
+         1749, 1803, 1426, 1348])
+        assert torch.allclose(zs[-2][0,:40], EXPECTED_OUTPUT)
 
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 4096 - zs[-2].shape[-1]).cpu()), dim=-1)
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[0][0, :30], EXPECTED_OUTPUT)
+        EXPECTED_OUTPUT = torch.tensor([904, 2037,  343, 1372,  135,  717,  506,  157,  307, 1419, 1751,  343,
+          899, 1803,  573,   94, 1046, 1014,  684,  869, 2037, 1125, 1004, 1658,
+         1181,   37, 1749, 2047, 1426, 1348, 2037, 1125, 1004, 1544,  573,  885,
+         1749, 1803, 1426, 1348])
+        assert torch.allclose(zs[0][0, :40], EXPECTED_OUTPUT)
 
     def test_5b_lyrics(self):        
         set_seed(0)

From 67d2c680ced30df2ff46a517752aba73d8ef9cb6 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Tue, 5 Jul 2022 15:30:58 +0200
Subject: [PATCH 020/196] refactoir toeknizer

---
 .../models/jukebox/tokenization_jukebox.py    | 89 ++++++++++++++-----
 1 file changed, 66 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index d081fe267915f..af77cf30c15b3 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -30,13 +30,21 @@
 logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
+    "artists_file": "artists.json",
+    "lyrics_file": "lyrics.json",
+    "genres_file": "genres.json",
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    "vocab_file": {
-        "jukebox": "https://huggingface.co/ArthurZ/jukebox/blob/main/vocab.json",
-    }
+    "artists_file": {
+        "jukebox": "https://huggingface.co/ArthurZ/jukebox/blob/main/artists.json",
+    },
+    "genres_file": {
+        "jukebox": "https://huggingface.co/ArthurZ/jukebox/blob/main/genres.json",
+    },
+    "lyrics_file": {
+        "jukebox": "https://huggingface.co/ArthurZ/jukebox/blob/main/lyrics.json",
+    },
 }
 
 PRETRAINED_LYRIC_TOKENS_SIZES = {
@@ -100,19 +108,36 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     max_lyric_input_size = PRETRAINED_LYRIC_TOKENS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(self, vocab_file, max_n_lyric_tokens=512, n_genres=5, unk_token="<|endoftext|>", **kwargs):
+    def __init__(
+        self,
+        artists_file,
+        genres_file,
+        lyrics_file,
+        version = ["v2","v3","v3"],
+        max_n_lyric_tokens=512,
+        n_genres=5,
+        unk_token="<|endoftext|>",
+        **kwargs
+    ):
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
             **kwargs,
         )
+        self.version = version
         self.max_n_lyric_tokens = max_n_lyric_tokens
         self.n_genres = n_genres
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
+        with open(artists_file, encoding="utf-8") as vocab_handle:
             vocabulary = json.load(vocab_handle)
             self.artists_encoder = vocabulary["artists"]
+
+        with open(genres_file, encoding="utf-8") as vocab_handle:
+            vocabulary = json.load(vocab_handle)
             self.genres_encoder = vocabulary["genres"]
+
+        with open(lyrics_file, encoding="utf-8") as vocab_handle:
+            vocabulary = json.load(vocab_handle)
             self.lyrics_encoder = vocabulary["lyrics"]
 
         self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")  # FIXME: should be an argument?
@@ -185,7 +210,6 @@ def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, dur
         artists_id = self.artists_encoder.get(artist)
         genres_ids = [self.genres_encoder.get(genre) for genre in genres]
         lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
-        lyric_ids = self.get_relevant_lyric_tokens(lyric_ids, total_length, offset, duration)
         return artists_id, genres_ids, lyric_ids
 
     def _tokenize(self, lyrics):
@@ -241,6 +265,15 @@ def prepare_for_tokenization(
         Returns:
             `Tuple[str, str, str, Dict[str, Any]]`: The prepared text and the unused kwargs.
         """
+        for idx,version in enumerate(self.version): 
+            if version == "v1":
+                artist = artist.lower()
+                genres = genres.lower()
+                lyrics = lyrics.lower()
+            else: 
+                artist[idx] = self._normalize(artist)
+                
+                
         artist = self._normalize(artist)
         genres = self._normalize(genres).split("_")  # split is for the full dictionnary with combined genres
 
@@ -291,8 +324,8 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
     # a type argument to add an artist token with self.getattr('artist') ?
     # TODO : is a call function required ?
 
-    def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
-        """Convert the raw string to token ids
+    def __call__(self, artist, genres, lyrics, total_length, offset):
+        """Convert the raw string to a list of token ids
 
         Args:
             artist (`_type_`):
@@ -303,15 +336,17 @@ def __call__(self, artist, genres, lyrics, total_length, sample_length, offset):
                 _description_
             total_length (`_type_`):
                 _description_
-            sample_length (`_type_`):
-                _description_
             offset (`_type_`):
                 _description_
         """
-        input_ids = [total_length, offset, sample_length]
+        input_ids = [total_length, offset, None]
+        artist = artist*len(self.version)
+        genres = genres*len(self.version)
+        lyrics = lyrics*len(self.version)
+        
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
         artists_id, genres_ids, lyric_ids = self._convert_token_to_id(
-            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset, sample_length
+            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset
         )
         input_ids += [artists_id] + genres_ids + lyric_ids
         attention_masks = [-INFINITY] * (self.max_n_lyric_tokens - len(lyrics_tokens)) + [0] * len(lyrics_tokens)
@@ -330,15 +365,23 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+
+        artists_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"]
         )
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(
-                json.dumps(
-                    {"artists": self.artists_encoder, "genres": self.genres_encoder, "lyrics": self.lyrics_encoder},
-                    ensure_ascii=False,
-                )
-            )
+        with open(artists_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.artists_encoder, ensure_ascii=False))
+
+        genres_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"]
+        )
+        with open(genres_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.genres_encoder, ensure_ascii=False))
+
+        lyrics_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"]
+        )
+        with open(lyrics_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
 
-        return (vocab_file,)
+        return (artists_file, genres_file, lyrics_file)

From 36dee3e5e21c4a09c4e0cd01621a5107727cba4c Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Tue, 5 Jul 2022 17:00:46 +0200
Subject: [PATCH 021/196] update

---
 .../models/jukebox/tokenization_jukebox.py    | 100 ++++++++++--------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index f0a36781f0879..b8e724e2b5f94 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -167,17 +167,15 @@ def __init__(
         self.n_genres = n_genres
 
         with open(artists_file, encoding="utf-8") as vocab_handle:
-            vocabulary = json.load(vocab_handle)
-            self.artists_encoder = vocabulary["artists"]
+            self.artists_encoder  = json.load(vocab_handle)
+            
 
         with open(genres_file, encoding="utf-8") as vocab_handle:
-            vocabulary = json.load(vocab_handle)
-            self.genres_encoder = vocabulary["genres"]
-
+            self.genres_encoder  = json.load(vocab_handle)
+           
         with open(lyrics_file, encoding="utf-8") as vocab_handle:
-            vocabulary = json.load(vocab_handle)
-            self.lyrics_encoder = vocabulary["lyrics"]
-
+            self.lyrics_encoder  = json.load(vocab_handle)
+            
         oov = '[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+'
         # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
         if len(self.lyrics_encoder) == 79:
@@ -195,7 +193,7 @@ def vocab_size(self):
     def get_vocab(self):
         return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
 
-    def _convert_token_to_id(self, artist, genres, lyrics, total_length, offset, duration):
+    def _convert_token_to_id(self, artist, genres, lyrics):
         """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
         The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
         the lyrics token sequence.
@@ -248,7 +246,7 @@ def tokenize(self, artist, genre, lyrics, **kwargs):
         return artist, genre, lyrics
 
     def prepare_for_tokenization(
-        self, artist: str, genres: str, lyrics: str, is_split_into_words: bool = False, **kwargs
+        self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False, **kwargs
     ) -> Tuple[str, str, str, Dict[str, Any]]:
         """
         Performs any necessary transformations before tokenization.
@@ -273,22 +271,31 @@ def prepare_for_tokenization(
         Returns:
             `Tuple[str, str, str, Dict[str, Any]]`: The prepared text and the unused kwargs.
         """
-        for idx,version in enumerate(self.version): 
-            if version == "v1":
-                artist = artist.lower()
-                genres = genres.lower()
-                lyrics = lyrics.lower()
+        for idx in range(len(self.version)): 
+            if self.version[idx] == "v3":
+                artists[idx] = artists[idx].lower() 
+                genres[idx] = [genres[idx].lower()]
             else: 
-                artist[idx] = self._normalize(artist)
+                artists[idx] = self._normalize(artists[idx])+ ".v2"
+                genres[idx] = self._normalize(genres[idx]+ ".v2").split("_")  # split is for the full dictionnary with combined genres
                 
                 
-        artist = self._normalize(artist)
-        genres = self._normalize(genres).split("_")  # split is for the full dictionnary with combined genres
+        if self.version[idx] == "v3":
+            self.out_of_vocab = re.compile('[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+')
+            vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+\'\"()[] \t\n'
+            self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
+            self.vocab['<unk>'] = 0
+            self.n_vocab = len(vocab) + 1
+            self.lyrics_encoder = {v: k for k, v in self.vocab.items()}
+            self.lyrics_encoder[0] = '' 
+        else: 
+            self.out_of_vocab = re.compile('[^A-Za-z0-9.,:;!?\-+\'\"()\[\] \t\n]+')
+            
         normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
         lyrics = normalizer.normalize_str(lyrics)
         lyrics = lyrics.replace("\\", "\n")
-        lyrics = self.out_of_vocab.sub("", lyrics)
-        return artist, genres, lyrics, kwargs
+        lyrics = [],[],self.out_of_vocab.sub("", lyrics)
+        return artists, genres, lyrics, kwargs
 
     def _normalize(self, text: str) -> str:
         """Normalizes the input text. This process is for the genres and the artit
@@ -302,7 +309,7 @@ def _normalize(self, text: str) -> str:
         accepted = (
             [chr(i) for i in range(ord("a"), ord("z") + 1)]
             + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
-            + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+            + [chr(i) for i in range(ord("0"), ord("9") + 1)] + ['.']
         )
 
         # In v2, " " is not accepted while it is for v3
@@ -314,20 +321,6 @@ def _normalize(self, text: str) -> str:
         text = rex.sub("_", text).strip("_")
         return text
 
-    def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
-        """Converts an index (integer) in a token (str) using the vocab.
-        Args:
-            artists_index (`int`):
-                Index of the artist in its corresponding dictionnary.
-            genres_index (`Union[List[int], int]`):
-               Index of the genre in its corresponding dictionnary.
-            lyric_index (`List[int]`):
-                List of character indices, which each correspond to a character.
-        """
-        artist = self.artists_decoder.get(artists_index)
-        genres = [self.genres_decoder.get(genre) for genre in genres_index]
-        lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
-        return artist, genres, lyrics
 
     def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
         return " ".join(lyrics)
@@ -335,7 +328,7 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
 
-    def __call__(self, artist, genres, lyrics, total_length, offset):
+    def __call__(self, artist, genres, lyrics, total_length, offset, return_tensor = "pt"):
         """Convert the raw string to a list of token ids
 
         Args:
@@ -350,23 +343,23 @@ def __call__(self, artist, genres, lyrics, total_length, offset):
             offset (`_type_`):
                 _description_
         """
-        input_ids = [total_length, offset, None]
-        artist = artist*len(self.version)
-        genres = genres*len(self.version)
-        lyrics = lyrics*len(self.version)
+        input_ids = [None, None, None]
+        artist = [artist]*len(self.version)
+        genres = [genres]*len(self.version)
+        
         
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
-        artists_id, genres_ids, lyric_ids = self._convert_token_to_id(
+        artists_id, genres_ids, full_tokens = self._convert_token_to_id(
             artists_tokens, genres_tokens, lyrics_tokens, total_length, offset
         )
-        input_ids += [artists_id] + genres_ids + relevant_tokens
-        attention_masks = [-INFINITY] * (len(full_tokens) - len(relevant_tokens)) + [0] * len(relevant_tokens)
+          
+        attention_masks = [-INFINITY] * len(full_tokens[-1])
         # TODO properly handle the return pt tensor option
         if return_tensor == "pt":
-            return {
-                "input_ids": {"y": torch.tensor([input_ids]), "full_tokens": full_tokens},
+            return [{
+                "input_ids":  input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i],
                 "attention_masks": torch.tensor(attention_masks),
-            }
+            } for i in range(len(self.version))]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
@@ -401,3 +394,18 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
 
         return (artists_file, genres_file, lyrics_file)
+    
+    def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
+        """Converts an index (integer) in a token (str) using the vocab.
+        Args:
+            artists_index (`int`):
+                Index of the artist in its corresponding dictionnary.
+            genres_index (`Union[List[int], int]`):
+               Index of the genre in its corresponding dictionnary.
+            lyric_index (`List[int]`):
+                List of character indices, which each correspond to a character.
+        """
+        artist = self.artists_decoder.get(artists_index)
+        genres = [self.genres_decoder.get(genre) for genre in genres_index]
+        lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
+        return artist, genres, lyrics

From 4b56ed9ffee968d57e7a9c6c42da40f2e07ee132 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Wed, 6 Jul 2022 07:01:59 +0200
Subject: [PATCH 022/196] fix tokenization

---
 .../models/jukebox/tokenization_jukebox.py    | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index b8e724e2b5f94..b655ca8138a76 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -149,7 +149,7 @@ def __init__(
         artists_file,
         genres_file,
         lyrics_file,
-        version = ["v2","v3","v3"],
+        version = ["v3","v2","v2"],
         max_n_lyric_tokens=512,
         n_genres=5,
         unk_token="<|endoftext|>",
@@ -159,6 +159,7 @@ def __init__(
         super().__init__(
             unk_token=unk_token,
             n_genres=n_genres, 
+            version = version,
             max_n_lyric_tokens=max_n_lyric_tokens,
             **kwargs,
         )
@@ -193,7 +194,7 @@ def vocab_size(self):
     def get_vocab(self):
         return dict(self.artists_encoder, self.genres_encoder, self.lyrics_encoder)
 
-    def _convert_token_to_id(self, artist, genres, lyrics):
+    def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
         """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
         The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
         the lyrics token sequence.
@@ -212,11 +213,13 @@ def _convert_token_to_id(self, artist, genres, lyrics):
             duration (`_type_`):
                 _description_
         """
-        artists_id = self.artists_encoder.get(artist)
-        genres_ids = [self.genres_encoder.get(genre) for genre in genres]
-        genres_ids = genres_ids + [-1] * (self.n_genres - len(genres_ids))
-        lyric_ids = [self.lyrics_encoder.get(character) for character in lyrics]
-        return artists_id, genres_ids, lyric_ids
+        artists_id = [self.artists_encoder.get(artist) for artist in list_artists ]
+        for genres in range(len(list_genres)):
+            list_genres[genres] = [self.genres_encoder.get(genre) for genre in list_genres[genres]]
+            list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
+        
+        lyric_ids = [[],[], [self.lyrics_encoder.get(character) for character in list_lyrics[-1]]]
+        return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):
         """
@@ -286,8 +289,9 @@ def prepare_for_tokenization(
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
             self.vocab['<unk>'] = 0
             self.n_vocab = len(vocab) + 1
-            self.lyrics_encoder = {v: k for k, v in self.vocab.items()}
-            self.lyrics_encoder[0] = '' 
+            self.lyrics_encoder = self.vocab
+            self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
+            self.lyrics_decoder[0] = '' 
         else: 
             self.out_of_vocab = re.compile('[^A-Za-z0-9.,:;!?\-+\'\"()\[\] \t\n]+')
             
@@ -311,10 +315,6 @@ def _normalize(self, text: str) -> str:
             + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
             + [chr(i) for i in range(ord("0"), ord("9") + 1)] + ['.']
         )
-
-        # In v2, " " is not accepted while it is for v3
-        if len(self.lyrics_encoder) == 79:
-            accepted += [" "]
         accepted = frozenset(accepted)
         rex = re.compile(r"_+")
         text = "".join([c if c in accepted else "_" for c in text.lower()])
@@ -328,7 +328,7 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
 
-    def __call__(self, artist, genres, lyrics, total_length, offset, return_tensor = "pt"):
+    def __call__(self, artist, genres, lyrics, return_tensor = "pt"):
         """Convert the raw string to a list of token ids
 
         Args:
@@ -343,14 +343,14 @@ def __call__(self, artist, genres, lyrics, total_length, offset, return_tensor =
             offset (`_type_`):
                 _description_
         """
-        input_ids = [None, None, None]
+        input_ids = [0, 0, 0]
         artist = [artist]*len(self.version)
         genres = [genres]*len(self.version)
         
         
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
         artists_id, genres_ids, full_tokens = self._convert_token_to_id(
-            artists_tokens, genres_tokens, lyrics_tokens, total_length, offset
+            artists_tokens, genres_tokens, lyrics_tokens
         )
           
         attention_masks = [-INFINITY] * len(full_tokens[-1])

From 92bed83693f66da8cf9a584233a71d00afc70eaa Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Wed, 6 Jul 2022 07:03:42 +0200
Subject: [PATCH 023/196] style

---
 .../models/jukebox/tokenization_jukebox.py    |  92 ++++-----
 tests/models/jukebox/test_modeling_jukebox.py | 193 ++++++++++++++----
 2 files changed, 204 insertions(+), 81 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index b655ca8138a76..c9c484ade524a 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -149,7 +149,7 @@ def __init__(
         artists_file,
         genres_file,
         lyrics_file,
-        version = ["v3","v2","v2"],
+        version=["v3", "v2", "v2"],
         max_n_lyric_tokens=512,
         n_genres=5,
         unk_token="<|endoftext|>",
@@ -158,8 +158,8 @@ def __init__(
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
         super().__init__(
             unk_token=unk_token,
-            n_genres=n_genres, 
-            version = version,
+            n_genres=n_genres,
+            version=version,
             max_n_lyric_tokens=max_n_lyric_tokens,
             **kwargs,
         )
@@ -168,21 +168,20 @@ def __init__(
         self.n_genres = n_genres
 
         with open(artists_file, encoding="utf-8") as vocab_handle:
-            self.artists_encoder  = json.load(vocab_handle)
-            
+            self.artists_encoder = json.load(vocab_handle)
 
         with open(genres_file, encoding="utf-8") as vocab_handle:
-            self.genres_encoder  = json.load(vocab_handle)
-           
+            self.genres_encoder = json.load(vocab_handle)
+
         with open(lyrics_file, encoding="utf-8") as vocab_handle:
-            self.lyrics_encoder  = json.load(vocab_handle)
-            
-        oov = '[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+'
+            self.lyrics_encoder = json.load(vocab_handle)
+
+        oov = "[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
         # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
         if len(self.lyrics_encoder) == 79:
-            oov = oov.replace("\-\'","\-+\'")
+            oov = oov.replace("\-'", "\-+'")
 
-        self.out_of_vocab = re.compile(oov) 
+        self.out_of_vocab = re.compile(oov)
         self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
         self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
         self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
@@ -213,12 +212,12 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
             duration (`_type_`):
                 _description_
         """
-        artists_id = [self.artists_encoder.get(artist) for artist in list_artists ]
+        artists_id = [self.artists_encoder.get(artist) for artist in list_artists]
         for genres in range(len(list_genres)):
             list_genres[genres] = [self.genres_encoder.get(genre) for genre in list_genres[genres]]
             list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
-        
-        lyric_ids = [[],[], [self.lyrics_encoder.get(character) for character in list_lyrics[-1]]]
+
+        lyric_ids = [[], [], [self.lyrics_encoder.get(character) for character in list_lyrics[-1]]]
         return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):
@@ -274,31 +273,32 @@ def prepare_for_tokenization(
         Returns:
             `Tuple[str, str, str, Dict[str, Any]]`: The prepared text and the unused kwargs.
         """
-        for idx in range(len(self.version)): 
+        for idx in range(len(self.version)):
             if self.version[idx] == "v3":
-                artists[idx] = artists[idx].lower() 
+                artists[idx] = artists[idx].lower()
                 genres[idx] = [genres[idx].lower()]
-            else: 
-                artists[idx] = self._normalize(artists[idx])+ ".v2"
-                genres[idx] = self._normalize(genres[idx]+ ".v2").split("_")  # split is for the full dictionnary with combined genres
-                
-                
+            else:
+                artists[idx] = self._normalize(artists[idx]) + ".v2"
+                genres[idx] = self._normalize(genres[idx] + ".v2").split(
+                    "_"
+                )  # split is for the full dictionnary with combined genres
+
         if self.version[idx] == "v3":
-            self.out_of_vocab = re.compile('[^A-Za-z0-9.,:;!?\-\'\"()\[\] \t\n]+')
-            vocab = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+\'\"()[] \t\n'
+            self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
+            vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
-            self.vocab['<unk>'] = 0
+            self.vocab["<unk>"] = 0
             self.n_vocab = len(vocab) + 1
             self.lyrics_encoder = self.vocab
             self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
-            self.lyrics_decoder[0] = '' 
-        else: 
-            self.out_of_vocab = re.compile('[^A-Za-z0-9.,:;!?\-+\'\"()\[\] \t\n]+')
-            
+            self.lyrics_decoder[0] = ""
+        else:
+            self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
+
         normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
         lyrics = normalizer.normalize_str(lyrics)
         lyrics = lyrics.replace("\\", "\n")
-        lyrics = [],[],self.out_of_vocab.sub("", lyrics)
+        lyrics = [], [], self.out_of_vocab.sub("", lyrics)
         return artists, genres, lyrics, kwargs
 
     def _normalize(self, text: str) -> str:
@@ -313,7 +313,8 @@ def _normalize(self, text: str) -> str:
         accepted = (
             [chr(i) for i in range(ord("a"), ord("z") + 1)]
             + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
-            + [chr(i) for i in range(ord("0"), ord("9") + 1)] + ['.']
+            + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+            + ["."]
         )
         accepted = frozenset(accepted)
         rex = re.compile(r"_+")
@@ -321,14 +322,13 @@ def _normalize(self, text: str) -> str:
         text = rex.sub("_", text).strip("_")
         return text
 
-
     def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
         return " ".join(lyrics)
 
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
 
-    def __call__(self, artist, genres, lyrics, return_tensor = "pt"):
+    def __call__(self, artist, genres, lyrics, return_tensor="pt"):
         """Convert the raw string to a list of token ids
 
         Args:
@@ -344,22 +344,22 @@ def __call__(self, artist, genres, lyrics, return_tensor = "pt"):
                 _description_
         """
         input_ids = [0, 0, 0]
-        artist = [artist]*len(self.version)
-        genres = [genres]*len(self.version)
-        
-        
+        artist = [artist] * len(self.version)
+        genres = [genres] * len(self.version)
+
         artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
-        artists_id, genres_ids, full_tokens = self._convert_token_to_id(
-            artists_tokens, genres_tokens, lyrics_tokens
-        )
-          
+        artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
+
         attention_masks = [-INFINITY] * len(full_tokens[-1])
         # TODO properly handle the return pt tensor option
         if return_tensor == "pt":
-            return [{
-                "input_ids":  input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i],
-                "attention_masks": torch.tensor(attention_masks),
-            } for i in range(len(self.version))]
+            return [
+                {
+                    "input_ids": input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i],
+                    "attention_masks": torch.tensor(attention_masks),
+                }
+                for i in range(len(self.version))
+            ]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
@@ -394,7 +394,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
 
         return (artists_file, genres_file, lyrics_file)
-    
+
     def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
         """Converts an index (integer) in a token (str) using the vocab.
         Args:
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index e23a7a78eab33..4c563020e293f 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -42,8 +42,7 @@ class JukeboxModelTest(unittest.TestCase):
         artist="Zac Brown Band",
         genres="Country",
         offset=0,
-        lyrics=
-    """I met a traveller from an antique land,
+        lyrics="""I met a traveller from an antique land,
     Who said "Two vast and trunkless legs of stone
     Stand in the desert. . . . Near them, on the sand,
     Half sunk a shattered visage lies, whose frown,
@@ -572,79 +571,206 @@ def test_conditioning(self):
         # start and end embeding
         # expected conditioning to match
 
-    def prepare_inputs(self,model, model_id, chunk_size = 32):
+    def prepare_inputs(self, model, model_id, chunk_size=32):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-         # create sampling parameters
+        # create sampling parameters
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
         sampling_kwargs = [
             dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
             dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size, sample_tokens=10),
+            dict(
+                temp=sampling_temperature,
+                fp16=False,
+                max_batch_size=max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+            ),
         ]
 
-        sample_length_in_seconds = 24     
+        sample_length_in_seconds = 24
         top_prior = model.priors[-1]
-        total_length = (int(sample_length_in_seconds*model.config.sr)//top_prior.raw_to_tokens)*top_prior.raw_to_tokens
-        tokens = tokenizer(**self.metas, sample_length =  top_prior.sample_length, total_length = total_length)
+        total_length = (
+            int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens
+        ) * top_prior.raw_to_tokens
+        tokens = tokenizer(**self.metas, sample_length=top_prior.sample_length, total_length=total_length)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
 
-       
         labels = [inputs.copy() for i in range(3)]
-        labels[1]["y"] = labels[1]["y"][:, :(4+tokenizer.n_genres)]
-        labels[0]["y"] = labels[0]["y"][:, :(4+tokenizer.n_genres)]
+        labels[1]["y"] = labels[1]["y"][:, : (4 + tokenizer.n_genres)]
+        labels[0]["y"] = labels[0]["y"][:, : (4 + tokenizer.n_genres)]
 
         return labels, sampling_kwargs
 
-    # @slow 
+    # @slow
     def test_1b_lyrics(self):
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
 
         model_id = "ArthurZ/jukebox-1b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval() 
-        
-        labels, sampling_kwargs = self.prepare_inputs(model,model_id)
+        model = JukeboxModel.from_pretrained(model_id).eval()
+
+        labels, sampling_kwargs = self.prepare_inputs(model, model_id)
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
 
         # TODO generate the original outputs
-        EXPECTED_OUTPUT = torch.tensor([1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
-          653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
-          555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
-         1106, 1127, 1494,  812])
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                1864,
+                1536,
+                1213,
+                1869,
+                1321,
+                1597,
+                519,
+                947,
+                1177,
+                789,
+                1434,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                653,
+                1007,
+                1472,
+                255,
+                1228,
+                555,
+                1272,
+                1379,
+                1423,
+                1673,
+                427,
+                1683,
+                1321,
+                475,
+                416,
+                1177,
+                1827,
+                1106,
+                1127,
+                1494,
+                812,
+            ]
+        )
         assert torch.allclose(zs[-1][0], EXPECTED_OUTPUT)
 
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor([904, 2037,  343, 1372,  135,  717,  506,  157,  307, 1419, 1751,  343,
-          899, 1803,  573,   94, 1046, 1014,  684,  869, 2037, 1125, 1004, 1658,
-         1181,   37, 1749, 2047, 1426, 1348, 2037, 1125, 1004, 1544,  573,  885,
-         1749, 1803, 1426, 1348])
-        assert torch.allclose(zs[-2][0,:40], EXPECTED_OUTPUT)
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                904,
+                2037,
+                343,
+                1372,
+                135,
+                717,
+                506,
+                157,
+                307,
+                1419,
+                1751,
+                343,
+                899,
+                1803,
+                573,
+                94,
+                1046,
+                1014,
+                684,
+                869,
+                2037,
+                1125,
+                1004,
+                1658,
+                1181,
+                37,
+                1749,
+                2047,
+                1426,
+                1348,
+                2037,
+                1125,
+                1004,
+                1544,
+                573,
+                885,
+                1749,
+                1803,
+                1426,
+                1348,
+            ]
+        )
+        assert torch.allclose(zs[-2][0, :40], EXPECTED_OUTPUT)
 
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor([904, 2037,  343, 1372,  135,  717,  506,  157,  307, 1419, 1751,  343,
-          899, 1803,  573,   94, 1046, 1014,  684,  869, 2037, 1125, 1004, 1658,
-         1181,   37, 1749, 2047, 1426, 1348, 2037, 1125, 1004, 1544,  573,  885,
-         1749, 1803, 1426, 1348])
+        EXPECTED_OUTPUT = torch.tensor(
+            [
+                904,
+                2037,
+                343,
+                1372,
+                135,
+                717,
+                506,
+                157,
+                307,
+                1419,
+                1751,
+                343,
+                899,
+                1803,
+                573,
+                94,
+                1046,
+                1014,
+                684,
+                869,
+                2037,
+                1125,
+                1004,
+                1658,
+                1181,
+                37,
+                1749,
+                2047,
+                1426,
+                1348,
+                2037,
+                1125,
+                1004,
+                1544,
+                573,
+                885,
+                1749,
+                1803,
+                1426,
+                1348,
+            ]
+        )
         assert torch.allclose(zs[0][0, :40], EXPECTED_OUTPUT)
 
-    def test_5b_lyrics(self):        
+    def test_5b_lyrics(self):
         set_seed(0)
         torch.backends.cuda.matmul.allow_tf32 = False
         torch.backends.cudnn.enabled = False
 
         model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval() 
-        
-        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length, chunk_size = 16)
+        model = JukeboxModel.from_pretrained(model_id).eval()
+
+        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length, chunk_size=16)
 
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
         zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
@@ -684,7 +810,6 @@ def test_5b_lyrics(self):
         )
         assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
 
-
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         # TODO find the expected outputs
@@ -768,5 +893,3 @@ def test_5b_lyrics(self):
     tester = JukeboxModelTest()
     tester.test_1b_lyrics()
     tester.test_5b_lyrics()
-
-

From 3bfc36c6dba4bc9ffed82ad0333f8da0b22d9110 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Wed, 6 Jul 2022 12:09:08 +0200
Subject: [PATCH 024/196] major refactoring

---
 .../models/jukebox/modeling_jukebox.py        | 329 +++++++-----------
 tests/models/jukebox/test_modeling_jukebox.py |  43 ++-
 2 files changed, 145 insertions(+), 227 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 71c235ad2c36a..cbb6ace2ee0ba 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -25,8 +25,6 @@
 from packaging import version
 from torch import nn
 
-from rich.progress import Progress
-
 
 if version.parse(torch.__version__) >= version.parse("1.6"):
     is_amp_available = True
@@ -61,6 +59,17 @@ def empty_cache():
     torch.cuda.empty_cache()
 
 
+import sys
+
+import tqdm
+
+
+def get_range(x):
+    return tqdm(
+        x, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+    )
+
+
 ####################################################################
 # Attention and scalable transformer
 # Import FusedLayerNorm if we have apex, otherwise use regular LayerNorm
@@ -82,7 +91,7 @@ def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
             w = torch.zeros(n_in, n_out)
         else:
             w = torch.empty(n_in, n_out)
-            nn.init.normal_(w, std=0.02 * init_scale)
+
         b = torch.zeros(n_out)
         self.weight = nn.Parameter(w)  # modified self.w
         self.bias = nn.Parameter(b)
@@ -99,6 +108,7 @@ def forward(self, x):
 class ResConvBlock(nn.Module):
     def __init__(self, n_in, n_state):
         super().__init__()
+        # TODO remvove the sequential in favor of a more understanble code
         self.model = nn.Sequential(
             nn.ReLU(),
             nn.Conv2d(n_in, n_state, 3, 1, 1),
@@ -113,6 +123,8 @@ def forward(self, x):
 class Resnet(nn.Module):
     def __init__(self, n_in, n_depth, m_conv=1.0):
         super().__init__()
+        # TODO remvove the sequential in favor of a more understanble code
+        # the list comprehension is maybe not very readable
         self.model = nn.Sequential(*[ResConvBlock(n_in, int(m_conv * n_in)) for _ in range(n_depth)])
 
     def forward(self, x):
@@ -123,12 +135,14 @@ class ResConv1DBlock(nn.Module):
     def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
         super().__init__()
         padding = dilation
+        # TODO remvove the sequential in favor of a more understanble code
         self.model = nn.Sequential(
             nn.ReLU(),
             nn.Conv1d(n_in, n_state, 3, 1, padding, dilation),
             nn.ReLU(),
             nn.Conv1d(n_state, n_in, 1, 1, 0),
         )
+        # TODO remvove the initialisation scheme
         if zero_out:
             out = self.model[-1]
             nn.init.zeros_(out.weight)
@@ -160,6 +174,8 @@ def _get_depth(depth):
             else:
                 return depth % dilation_cycle
 
+        # TODO remvove comprehension in favor of a for loop more understanbnle
+
         blocks = [
             ResConv1DBlock(
                 n_in,
@@ -172,22 +188,11 @@ def _get_depth(depth):
         ]
         if reverse_dilation:
             blocks = blocks[::-1]
-        self.checkpoint_res = checkpoint_res
-        # if self.checkpoint_res == 1:
-        #     # if dist.get_rank() == 0:
-        #     #     print("Checkpointing convs")
-        #     self.blocks = nn.ModuleList(blocks)
-        # else:
-        #    self.model = nn.Sequential(*blocks)
+
+        # TODO remvove the sequential in favor of a more understanble code
         self.model = nn.Sequential(*blocks)
 
     def forward(self, x):
-        # if self.checkpoint_res == 1:
-        #     for block in self.blocks:
-        #         x = checkpoint(block, (x, ), block.parameters(), True)
-        #     return x
-        # else:
-        #     return self.model(x)
         return self.model(x)
 
 
@@ -211,6 +216,7 @@ def __init__(
         filter_t, pad_t = stride_t * 2, stride_t // 2
         if down_t > 0:
             for i in range(down_t):
+                # TODO remvove the sequential in favor of a more understanble code
                 block = nn.Sequential(
                     nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t),
                     Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale),
@@ -294,13 +300,13 @@ def level_block(level, down_t, stride_t):
             )
 
         self.level_blocks = nn.ModuleList()
+        # TODO remvove iterator
+
         iterator = zip(list(range(self.levels)), downs_t, strides_t)
         for level, down_t, stride_t in iterator:
             self.level_blocks.append(level_block(level, down_t, stride_t))
 
     def forward(self, x):
-        # N, T = x.shape[0], x.shape[-1]
-        # emb = self.input_emb_width
         xs = []
 
         # 64, 32, ...
@@ -308,8 +314,6 @@ def forward(self, x):
         for level, down_t, stride_t in iterator:
             level_block = self.level_blocks[-level - 1]
             x = level_block(x)
-            # emb, T = self.output_emb_width, T // (stride_t**down_t)
-            # assert_shape(x, (N, emb, T))
             xs.append(x)
 
         return xs
@@ -321,9 +325,7 @@ def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t
         self.input_emb_width = input_emb_width
         self.output_emb_width = output_emb_width
         self.levels = levels
-
         self.downs_t = downs_t
-
         self.strides_t = strides_t
 
         def level_block(level, down_t, stride_t):
@@ -342,17 +344,13 @@ def forward(self, xs, all_levels=True):
         else:
             assert len(xs) == 1
         x = xs[-1]
-        _, T = x.shape[0], x.shape[-1]
-        # emb = self.output_emb_width
-        # assert_shape(x, (N, emb, T))
 
         # 32, 64 ...
         iterator = reversed(list(zip(list(range(self.levels)), self.downs_t, self.strides_t)))
         for level, down_t, stride_t in iterator:
             level_block = self.level_blocks[level]
             x = level_block(x)
-            _, T = self.output_emb_width, T * (stride_t**down_t)
-            # assert_shape(x, (N, emb, T))
+
             if level != 0 and all_levels:
                 x = x + xs[level - 1]
 
@@ -409,11 +407,6 @@ def reset_k(self):
         self.init = False
         self.k_sum = None
         self.k_elem = None
-        # self.register_buffer('k',  torch.zeros(self.k_bins, self.emb_width).cuda())
-
-        # if torch.cuda.is_available():
-        #     self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width).to("cuda"))
-        # else:
         self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
 
     def _tile(self, x):
@@ -426,7 +419,9 @@ def _tile(self, x):
         return x
 
     def init_k(self, x):
-        _, emb_width, k_bins = self.mu, self.emb_width, self.k_bins  # mu,
+        # TODO rename x to a way more meaningful name
+
+        emb_width, k_bins = self.emb_width, self.k_bins  # mu,
         self.init = True
         # init k_w using random vectors from x
         y = self._tile(x)
@@ -438,9 +433,8 @@ def init_k(self, x):
         self.k_elem = torch.ones(k_bins, device=self.k.device)
 
     def restore_k(self, num_tokens=None, threshold=1.0):
-        _, emb_width, k_bins = self.mu, self.emb_width, self.k_bins  # mu -> _
+        emb_width, k_bins = self.emb_width, self.k_bins  # mu -> _
         self.init = True
-        assert self.k.shape == (k_bins, emb_width)
         self.k_sum = self.k.clone()
         self.k_elem = torch.ones(k_bins, device=self.k.device)
         if num_tokens is not None:
@@ -525,9 +519,11 @@ def encode(self, x):
 
         # Preprocess.
         x, prenorm = self.preprocess(x)
+        # TODO remvove unused prenorm variable
 
         # Quantise
         x_l, fit = self.quantise(x)
+        # TODO remvove unused fit and the return variable
 
         # Postprocess.
         x_l = x_l.view(N, T)
@@ -617,6 +613,7 @@ def forward(self, xs):
         return zs, xs_quantised, commit_losses, metrics
 
 
+# TODO replace FFT calls with torch.fft
 def stft(sig, hps):
     return torch.stft(
         sig,
@@ -627,10 +624,14 @@ def stft(sig, hps):
     )
 
 
+# TODO replace spec
 def spec(x, hps):
     return torch.norm(stft(x, hps), p=2, dim=-1)
 
 
+# TODO check if can be removed
+
+
 class DefaultSTFTValues:
     def __init__(self, hps):
         self.sr = hps.sr
@@ -773,7 +774,6 @@ def decoder(level):
 
     def preprocess(self, x):
         # x: NTC [-1,1] -> NCT [-1,1]
-        assert len(x.shape) == 3
         x = x.permute(0, 2, 1).float()
         return x
 
@@ -786,10 +786,7 @@ def _decode(self, zs, start_level=0, end_level=None):
         # Decode
         if end_level is None:
             end_level = self.levels
-        assert len(zs) == end_level - start_level
         xs_quantised = self.bottleneck.decode(zs, start_level=start_level, end_level=end_level)
-        assert len(xs_quantised) == end_level - start_level
-
         # Use only lowest level
         decoder, x_quantised = self.decoders[start_level], xs_quantised[0:1]
         x_out = decoder(x_quantised, all_levels=False)
@@ -828,6 +825,8 @@ def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
         return zs
 
     def sample(self, n_samples):
+        # TODO handle device properly
+
         zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
         return self.decode(zs)
 
@@ -936,6 +935,9 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+# TODO rename to JukeboxLayerNorm
+
+
 class LayerNorm(FusedLayerNorm):
     def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
         super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
@@ -1008,7 +1010,6 @@ def __init__(
         self.width = width  # should have a better name
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
         self.n_state = n_state
-        assert n_state % n_head == 0
         self.n_head = n_head
         self.scale = scale
         self.mask = mask
@@ -1039,7 +1040,6 @@ def __init__(
         self.blocks = blocks
         self.spread = spread
         if blocks is not None:
-            assert n_ctx % blocks == 0
             self.block_ctx = n_ctx // blocks
         self.checkpoint_attn = checkpoint_attn  # 0: None, 1: Attn after heads split, 2: Attn
 
@@ -1097,10 +1097,6 @@ def dense_attn(self, query, key, value, sample):
         query = self.split_heads(query)
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)
-        # if self.checkpoint_attn == 1 and not sample:
-        #     a = checkpoint(lambda q,k,v,s=sample: self._attn(q,k,v,s), (query, key, value),
-        #                (), True)
-        # else:
         a = self._attn(query, key, value, sample)
         a = self.merge_heads(a)
         return a
@@ -1291,20 +1287,10 @@ def prime_qkv(self, x, encoder_kv=None, sample=False):
                 self._slice_cache(0, self._prime_len)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
-            assert (
-                key.shape[1] == value.shape[1] == self._suff_cache_len()
-            ), f"k: {key.shape}, v: {value.shape}, prime_dims: {self._suff_cache_len()}"
-        else:
-            assert (
-                key.shape[1] == value.shape[1] == self.n_ctx
-            ), f"k: {key.shape}, v: {value.shape}, prime_dims: {self.n_ctx}"
-        assert key.shape[0] == value.shape[0] == query.shape[0], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
-        assert key.shape[2] == value.shape[2] == query.shape[2], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
         return query, key, value, sample
 
     def decode_qkv(self, x, encoder_kv=None, sample=False):
         curr_ctx = x.shape[1]
-        assert encoder_kv is not None
         query = x
         if sample:
             if self.sample_t == 0:
@@ -1313,20 +1299,12 @@ def decode_qkv(self, x, encoder_kv=None, sample=False):
             self.sample_t += curr_ctx
         else:
             key, value = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
-        assert key.shape[0] == value.shape[0] == query.shape[0], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
-        assert (
-            key.shape[1] == value.shape[1] == self.encoder_dims
-        ), f"k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}"
-        assert key.shape[2] == value.shape[2] == query.shape[2], f"k: {key.shape}, v: {value.shape}, q: {query.shape}"
         return query, key, value, sample
 
     def forward(self, x, encoder_kv=None, sample=False):
         curr_ctx = x.shape[1]
         x = self.c_attn(x)
         query, key, value, sample = self.qkv(x, encoder_kv=encoder_kv, sample=sample)
-        # if self.checkpoint_attn == 2 and not sample:
-        #     a = checkpoint(lambda q,k,v,s=sample: self.attn(q,k,v,s), (query, key, value), (), True)
-        # else:
         a = self.attn(query, key, value, sample)
         if a.shape[1] != curr_ctx:
             offset = self._offset(curr_ctx)
@@ -1755,15 +1733,6 @@ def forward(self, x, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
 
         # Blocks
         for i, l in enumerate(self._attn_mods):
-            # if self.checkpoint_res == 1 and not sample:
-            #     if l.attn_func == 6:
-            #         assert encoder_kv is not None
-            #         f = functools.partial(l, sample=sample)
-            #         x = checkpoint(f, (x, encoder_kv), l.parameters(), True)
-            #     else:
-            #         f = functools.partial(l, encoder_kv=None, sample=sample)
-            #         x = checkpoint(f, (x,), l.parameters(), True)
-            # else:
             if l.attn_func == 6:
                 x = l(x, encoder_kv=encoder_kv, sample=sample)
             else:
@@ -1815,14 +1784,6 @@ def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
         self.input_shape = input_shape
         self.input_dims = input_dims = np.prod(input_shape)
         self.pos_init = pos_init
-        # if pos_init:
-        #    self.register_buffer("pos", torch.tensor(get_pos_idx(input_shape)).long())
-        #    self._pos_embs = nn.ModuleList()
-        #    for i in range(len(input_shape)):
-        #        emb = nn.Embedding(input_shape[i], width)
-        #        nn.init.normal_(emb.weight, std=0.02)
-        #        self._pos_embs.append(emb)
-        # else:
         self.pos_emb = nn.Parameter(get_normal(input_dims, width, std=0.01 * init_scale))
 
     def forward(self):
@@ -1961,27 +1922,27 @@ def forward(
 
         N, D = x.shape
         # assert isinstance(x, torch.cuda.LongTensor)
-        assert (0 <= x).all() and (x < self.bins).all()
+        # assert (0 <= x).all() and (x < self.bins).all()
 
-        if self.y_cond:
-            assert y_cond is not None
-            assert y_cond.shape == (N, 1, self.width)
-        else:
-            assert y_cond is None
-
-        if self.x_cond:
-            assert x_cond is not None
-            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
-                N,
-                1,
-                self.width,
-            ), (
-                f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
-                " --sample_length?"
-            )
-        else:
-            assert x_cond is None
-            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+        # if self.y_cond:
+        #     assert y_cond is not None
+        #     assert y_cond.shape == (N, 1, self.width)
+        # else:
+        #     assert y_cond is None
+
+        # if self.x_cond:
+        #     assert x_cond is not None
+        #     assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
+        #         N,
+        #         1,
+        #         self.width,
+        #     ), (
+        #         f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
+        #         " --sample_length?"
+        #     )
+        # else:
+        #     assert x_cond is None
+        #     x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
 
         x_t = x  # Target
         x = self.x_emb(x)  # X emb
@@ -2086,10 +2047,8 @@ def sample(
             xs, x = [], None
             if get_preds:
                 preds = []
-            # for sample_t in get_range(range(0, sample_tokens)):
-            total = sample_tokens - len(xs)
-            task3 = progress.add_task("Sampling indivdual tokens ", total=total)
-            for sample_t in range(0, sample_tokens):
+
+            for sample_t in get_range(range(0, sample_tokens)):
 
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
@@ -2108,9 +2067,6 @@ def sample(
                 x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
                 assert x.shape == (n_samples, 1)
                 xs.append(x.clone())
-                progress.update(task3, advance=1)
-                progress.update(0, advance=1 / total)
-
             del x
             self.transformer.del_cache()
 
@@ -2182,9 +2138,8 @@ def primed_sample(
             x_primes = []
             start = 0
             x = None
-            # for current_chunk_size in get_range(chunk_sizes):
-            task2 = progress.add_task("Primed Sampling chunks ", total=len(chunk_sizes))
-            for current_chunk_size in chunk_sizes:
+
+            for current_chunk_size in get_range(chunk_sizes):
 
                 xs_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
@@ -2212,8 +2167,6 @@ def primed_sample(
                 else:
                     del x_prime
 
-                progress.update(task2, advance=1)
-
             if get_preds:
                 x_prime = torch.cat(x_primes, dim=1)
                 assert x_prime.shape == (n_samples, len(xs), self.width)
@@ -2226,10 +2179,8 @@ def primed_sample(
             x = xs[-1]
             assert x.shape == (n_samples, 1)
             empty_cache()
-            # for sample_t in get_range(range(len(xs), sample_tokens)):
-            total = sample_tokens - len(xs)
-            task3 = progress.add_task("Sampling indivdual tokens ", total=total)
-            for sample_t in range(len(xs), sample_tokens):
+
+            for sample_t in get_range(range(len(xs), sample_tokens)):
                 x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
@@ -2245,8 +2196,6 @@ def primed_sample(
                 x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
                 assert x.shape == (n_samples, 1)
                 xs.append(x.clone())
-                progress.update(task3, advance=1)
-                progress.update(0, advance=1 / total)
 
             del x
             self.transformer.del_cache()
@@ -2588,7 +2537,6 @@ def rescale(z_shape):
         self.z_shape = self.z_shapes[level]
 
         self.level = level
-        assert level < self.levels, f"Total levels {self.levels}, got level {level}"
 
         self.l_bins = config.l_bins
 
@@ -2745,30 +2693,24 @@ def conditioner_block(_level):
         self.cond_downsample = self.downsamples[level + 1] if level != self.levels - 1 else None
         self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
         self.sample_length = self.n_ctx * self.raw_to_tokens
-        # if the labels are used for training, the trainer will use it?
-        # This is probably were its gonna get a bit complicated
-
-        # if labels:
-        #     self.labels_v3 = labels_v3
-        #     self.labeller = Labeller(self.y_emb.max_bow_genre_size, self.n_tokens, self.sample_length, v3=self.labels_v3)
-        # else:
-        #     self.labeller = EmptyLabeller()
 
         print(
             f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
             f" length:{self.sample_length}"
         )
 
-    def get_y(self, labels, start, get_indices=False):
-        y = labels["y"].clone()
+    def get_y(self, labels, start, total_length, get_indices=False):
+        y = labels["input_ids"].clone()
         # y = labels.clone()
-
+        y[:, 0] = total_length
         # Set sample_length to match this level
         y[:, 2] = int(self.sample_length)
 
         # Set offset
         y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
-        indices = self.set_y_lyric_tokens(y, labels)
+        # here since y has the full token_list, ze just need to selected the ones that are relevant
+
+        y, indices = self.set_y_lyric_tokens(y, labels)
         if get_indices:
             return y, indices
         else:
@@ -2781,13 +2723,18 @@ def set_y_lyric_tokens(self, ys, labels):
             tokens_list = []
             indices_list = []  # whats the index of each current character in original array
             for i in range(ys.shape[0]):
-                full_tokens = labels["full_tokens"]
+                full_tokens = labels["input_ids"]
                 total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
                 tokens_list.append(tokens)
                 indices_list.append(indices)
             ys[:, -self.n_tokens :] = torch.tensor(tokens_list, dtype=torch.long, device="cpu")
-            return indices_list
+            return [
+                total_length,
+                offset,
+                duration,
+                torch.tensor(tokens_list, dtype=torch.long, device="cpu"),
+            ], indices_list
         else:
             return None
 
@@ -2911,9 +2858,6 @@ def sample(
                 assert z_cond.shape[0] == N, f"Expected shape ({N},**), got shape {z_cond.shape}"
 
         no_past_context = z is None or z.shape[1] == 0
-        # if dist.get_rank() == 0:
-        #     name = {True: 'Ancestral', False: 'Primed'}[no_past_context]
-        #     print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
         print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
 
@@ -3061,42 +3005,40 @@ class JukeboxPreTrainedModel(PreTrainedModel):
 
     config_class = JukeboxConfig
     base_model_prefix = "transformer"
-    is_parallelizable = True
-    supports_gradient_checkpointing = True
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, (nn.Linear, Conv1D)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-        # Reinitialize selected weights subject to the Jukebox Paper Scheme:
-        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
-        #
-        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-        for name, p in module.named_parameters():
-            if "c_proj" in name and "weight" in name:
-                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, JukeboxModel):
-            module.gradient_checkpointing = value
+    # def _init_weights(self, module):
+    #     """Initialize the weights."""
+    #     if isinstance(module, (nn.Linear, Conv1D)):
+    #         # Slightly different from the TF version which uses truncated_normal for initialization
+    #         # cf https://github.com/pytorch/pytorch/pull/5617
+    #         module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+    #         if module.bias is not None:
+    #             module.bias.data.zero_()
+    #     elif isinstance(module, nn.Embedding):
+    #         module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+    #         if module.padding_idx is not None:
+    #             module.weight.data[module.padding_idx].zero_()
+    #     elif isinstance(module, nn.LayerNorm):
+    #         module.bias.data.zero_()
+    #         module.weight.data.fill_(1.0)
+
+    #     # Reinitialize selected weights subject to the Jukebox Paper Scheme:
+    #     #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+    #     #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+    #     #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+    #     #
+    #     # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+    #     for name, p in module.named_parameters():
+    #         if "c_proj" in name and "weight" in name:
+    #             # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+    #             p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+    # def _set_gradient_checkpointing(self, module, value=False):
+    #     if isinstance(module, JukeboxModel):
+    #         module.gradient_checkpointing = value
 
 
 JUKEBOX_START_DOCSTRING = r"""
@@ -3172,7 +3114,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
         end = start + n_ctx
 
         # set y offset, sample_length and lyrics tokens
-        y, indices_hop = prior.get_y(labels, start, get_indices=True)
+        y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True)
         # assert len(indices_hop) == bs
         for indices in indices_hop:
             assert len(indices) == n_tokens
@@ -3204,7 +3146,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     alignments = []
     for item in range(bs):
         # Note each item has different length lyrics
-        full_tokens = labels["info"][item]["full_tokens"]
+        full_tokens = labels["input_ids"][:, 3:]
         alignment = np.zeros((total_length, len(full_tokens) + 1))
         for start in reversed(get_starts(total_length, n_ctx, hop_length)):
             end = start + n_ctx
@@ -3218,20 +3160,6 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     return alignments
 
 
-from rich.live import Live
-from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn
-
-
-progress = Progress(
-    "{task.description}",
-    SpinnerColumn(),
-    BarColumn(),
-    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
-    TimeElapsedColumn(),
-    TimeRemainingColumn(),
-)
-
-
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,
@@ -3248,14 +3176,6 @@ def __init__(self, config):
         config.vqvae_z_shapes = self.vqvae.z_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-        self.gradient_checkpointing = False
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
     def sample_partial_window(self, zs, labels, sampling_kwargs, level, tokens_to_sample, hps):
         prior = self.priors[level]
@@ -3277,7 +3197,9 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
-
+        total_length = sampling_kwargs.totat_length  # this is new, but makes way more sens than having it inside
+        # the tokenizer, as [total_length, offset, sample_length] can be written on the fly and changed without changing the
+        # lyric tokens.
         # get z already sampled at current level
         z = zs[level][:, start:end]
 
@@ -3302,7 +3224,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start)
+        y = prior.get_y(labels, start, total_length)
 
         empty_cache()
         max_batch_size = 2
@@ -3331,12 +3253,8 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
         print(f"Sampling level {level}")
         prior = self.priors[level]
         if total_length >= prior.n_ctx:
-            with Live(progress):
-                progress.add_task(
-                    "[red]Sampling single window...", total=len(get_starts(total_length, prior.n_ctx, hop_length))
-                )
-                for start in get_starts(total_length, prior.n_ctx, hop_length):
-                    zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+            for start in get_range(get_starts(total_length, prior.n_ctx, hop_length)):
+                zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
 
         else:
             zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
@@ -3359,13 +3277,6 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
-            # TODO either mask them or ddo better
-            # if level != len(sample_levels) - 1:
-            #     labels_level = labels[level][0][: 4 + hps.max_bow_genre_size].unsqueeze(0)
-            #     zs = self.sample_level(zs, labels_level, sampling_kwargs[level], level, total_length, hop_length, hps)
-            # else:
-            #     zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
-
             prior.to(zs[0].device)
             empty_cache()
 
@@ -3373,10 +3284,6 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
             with torch.no_grad():
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
 
-            # if dist.get_world_size() > 1:
-            #     logdir = f"{hps.name}_rank_{dist.get_rank()}/level_{level}"
-            # else:
-            #     logdir = f"{hps.name}/level_{level}"
             logdir = f"{hps.name}/level_{level}"
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 4c563020e293f..3a8530dd3d54d 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -41,7 +41,6 @@ class JukeboxModelTest(unittest.TestCase):
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
-        offset=0,
         lyrics="""I met a traveller from an antique land,
     Who said "Two vast and trunkless legs of stone
     Stand in the desert. . . . Near them, on the sand,
@@ -58,7 +57,9 @@ class JukeboxModelTest(unittest.TestCase):
     The lone and level sands stretch far away
     """,
     )
+
     # @slow
+
     def test_model(self):
         set_seed(0)
 
@@ -573,35 +574,45 @@ def test_conditioning(self):
 
     def prepare_inputs(self, model, model_id, chunk_size=32):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
+        top_prior = model.priors[-1]
         # create sampling parameters
         sampling_temperature = 0.98
         lower_batch_size = 16
         max_batch_size = 16
+        sample_length_in_seconds = 24
         sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=chunk_size, sample_tokens=10),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
             dict(
                 temp=sampling_temperature,
                 fp16=False,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
             ),
         ]
 
-        sample_length_in_seconds = 24
-        top_prior = model.priors[-1]
-        total_length = (
-            int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens
-        ) * top_prior.raw_to_tokens
-        tokens = tokenizer(**self.metas, sample_length=top_prior.sample_length, total_length=total_length)
+        tokens = tokenizer(**self.metas)
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-
-        labels = [inputs.copy() for i in range(3)]
-        labels[1]["y"] = labels[1]["y"][:, : (4 + tokenizer.n_genres)]
-        labels[0]["y"] = labels[0]["y"][:, : (4 + tokenizer.n_genres)]
-
-        return labels, sampling_kwargs
+        return inputs, sampling_kwargs
 
     # @slow
     def test_1b_lyrics(self):
@@ -892,4 +903,4 @@ def test_5b_lyrics(self):
 if __name__ == "__main__":
     tester = JukeboxModelTest()
     tester.test_1b_lyrics()
-    tester.test_5b_lyrics()
+    # tester.test_5b_lyrics()

From 3ff1468eb9332b068f94a475ce0f528d0dbb86a7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 06:44:50 +0000
Subject: [PATCH 025/196] fix 1b and 5b, refactor tokenizer. Both models are
 ready

---
 .../models/jukebox/configuration_jukebox.py   |   2 +-
 .../models/jukebox/modeling_jukebox.py        | 121 ++--
 .../models/jukebox/tokenization_jukebox.py    |  18 +-
 tests/models/jukebox/test_modeling_jukebox.py | 528 ++++++++----------
 4 files changed, 302 insertions(+), 367 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 849db231c5707..9932879d23d18 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -198,7 +198,7 @@ def __init__(
         cond_dilation_growth_rate=[1, 3, 3],
         cond_dilation_cycle=[None, 8, 8],
         cond_c_res=[0, 1, 1],
-        cond_res_scale=False,
+        cond_res_scale=[None,True, False],
         prime_width=[128, 128, 128],
         prime_depth=[18, 3, 3],
         prime_cond_c_res=[0, 1, 1],
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index cbb6ace2ee0ba..869dea69a698c 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -61,7 +61,7 @@ def empty_cache():
 
 import sys
 
-import tqdm
+from tqdm import tqdm
 
 
 def get_range(x):
@@ -186,13 +186,19 @@ def _get_depth(depth):
             )
             for depth in range(n_depth)
         ]
+        self.checkpoint_res = checkpoint_res
         if reverse_dilation:
             blocks = blocks[::-1]
-
-        # TODO remvove the sequential in favor of a more understanble code
-        self.model = nn.Sequential(*blocks)
+        if self.checkpoint_res == 1:
+            self.blocks = nn.ModuleList(blocks)
+        else:
+            self.model = nn.Sequential(*blocks)
 
     def forward(self, x):
+        if self.checkpoint_res == 1:
+            for block in self.blocks:
+                x = block(x)
+            return x
         return self.model(x)
 
 
@@ -255,20 +261,8 @@ def __init__(
             blocks.append(block)
             for i in range(down_t):
                 block = nn.Sequential(
-                    Resnet1D(
-                        width,
-                        depth,
-                        m_conv,
-                        dilation_growth_rate,
-                        dilation_cycle,
-                        zero_out=zero_out,
-                        res_scale=res_scale,
-                        reverse_dilation=reverse_decoder_dilation,
-                        checkpoint_res=checkpoint_res,
-                    ),
-                    nn.ConvTranspose1d(
-                        width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
-                    ),
+                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out=zero_out, res_scale=res_scale, reverse_dilation=reverse_decoder_dilation, checkpoint_res=checkpoint_res),
+                    nn.ConvTranspose1d(width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t)
                 )
                 blocks.append(block)
         self.model = nn.Sequential(*blocks)
@@ -1924,25 +1918,25 @@ def forward(
         # assert isinstance(x, torch.cuda.LongTensor)
         # assert (0 <= x).all() and (x < self.bins).all()
 
-        # if self.y_cond:
-        #     assert y_cond is not None
-        #     assert y_cond.shape == (N, 1, self.width)
-        # else:
-        #     assert y_cond is None
-
-        # if self.x_cond:
-        #     assert x_cond is not None
-        #     assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
-        #         N,
-        #         1,
-        #         self.width,
-        #     ), (
-        #         f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
-        #         " --sample_length?"
-        #     )
-        # else:
-        #     assert x_cond is None
-        #     x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+        if self.y_cond:
+            assert y_cond is not None
+            assert y_cond.shape == (N, 1, self.width)
+        else:
+            assert y_cond is None
+
+        if self.x_cond:
+            assert x_cond is not None
+            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
+                N,
+                1,
+                self.width,
+            ), (
+                f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
+                " --sample_length?"
+            )
+        else:
+            assert x_cond is None
+            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
 
         x_t = x  # Target
         x = self.x_emb(x)  # X emb
@@ -2339,17 +2333,11 @@ def postprocess(self, x):
         return x
 
     def forward(self, x, x_cond=None):
-        # N = x.shape[0]
-        # assert_shape(x, (N, *self.x_shape))
-        if x_cond is not None:
-            # assert_shape(x_cond, (N, *self.x_shape, self.width))
-            pass
-        else:
-            x_cond = 0.0
+        if x_cond is None:
+           x_cond = 0.0
         # Embed x
         x = x.long()
         x = self.x_emb(x)
-        # assert_shape(x, (N, *self.x_shape, self.width))
         x = x + x_cond
 
         # Run conditioner
@@ -2590,7 +2578,7 @@ def rescale(z_shape):
             dilation_growth_rate=config.cond_dilation_growth_rate[-level - 1],
             dilation_cycle=config.cond_dilation_cycle[-level - 1],
             zero_out=config.cond_zero_out,
-            res_scale=config.cond_res_scale,
+            res_scale=config.cond_res_scale[-level - 1],
             checkpoint_res=config.cond_c_res[-level - 1],
         )  # have to keep this else names wrong
 
@@ -2700,7 +2688,7 @@ def conditioner_block(_level):
         )
 
     def get_y(self, labels, start, total_length, get_indices=False):
-        y = labels["input_ids"].clone()
+        y = labels.clone()
         # y = labels.clone()
         y[:, 0] = total_length
         # Set sample_length to match this level
@@ -2710,33 +2698,30 @@ def get_y(self, labels, start, total_length, get_indices=False):
         y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
         # here since y has the full token_list, ze just need to selected the ones that are relevant
 
-        y, indices = self.set_y_lyric_tokens(y, labels)
+        # Set lyric tokens
+        y, indices = self.set_y_lyric_tokens(y)
         if get_indices:
             return y, indices
         else:
             return y
 
-    def set_y_lyric_tokens(self, ys, labels):
+
+    def set_y_lyric_tokens(self, labels):
         # assert ys.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
-            tokens_list = []
+            tokens_list = torch.zeros((1,self.n_tokens),dtype=torch.long)
             indices_list = []  # whats the index of each current character in original array
-            for i in range(ys.shape[0]):
-                full_tokens = labels["input_ids"]
-                total_length, offset, duration = ys[i, 0], ys[i, 1], ys[i, 2]
+            for i in range(labels.shape[0]):
+                full_tokens = labels.clone()[:,4 + self.y_emb.max_bow_genre_size:]
+                total_length, offset, duration = labels[i, 0], labels[i, 1], labels[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
-                tokens_list.append(tokens)
+                tokens_list[i,:] = tokens
                 indices_list.append(indices)
-            ys[:, -self.n_tokens :] = torch.tensor(tokens_list, dtype=torch.long, device="cpu")
-            return [
-                total_length,
-                offset,
-                duration,
-                torch.tensor(tokens_list, dtype=torch.long, device="cpu"),
-            ], indices_list
+                
+            return torch.cat((labels[:, :4 + self.y_emb.max_bow_genre_size],tokens_list),dim=-1), indices_list
         else:
-            return None
+            return labels, None
 
     def get_z_conds(self, zs, start, end):
         if self.level != self.levels - 1:
@@ -3146,7 +3131,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     alignments = []
     for item in range(bs):
         # Note each item has different length lyrics
-        full_tokens = labels["input_ids"][:, 3:]
+        full_tokens = labels[:, 3:]
         alignment = np.zeros((total_length, len(full_tokens) + 1))
         for start in reversed(get_starts(total_length, n_ctx, hop_length)):
             end = start + n_ctx
@@ -3197,7 +3182,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
-        total_length = sampling_kwargs.totat_length  # this is new, but makes way more sens than having it inside
+        
         # the tokenizer, as [total_length, offset, sample_length] can be written on the fly and changed without changing the
         # lyric tokens.
         # get z already sampled at current level
@@ -3224,7 +3209,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start, total_length)
+        y = prior.get_y(labels, start, self.total_length)
 
         empty_cache()
         max_batch_size = 2
@@ -3262,10 +3247,12 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
 
     # Sample multiple levels
     def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
+        
         alignments = None
         for level in reversed(sample_levels):
+            self.total_length = sampling_kwargs[level].pop('total_length') 
             prior = self.priors[level]
-            prior = prior.to(zs[0].device)
+            prior = prior.to(zs[0].device).eval()
             empty_cache()
 
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
@@ -3277,7 +3264,7 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
-            prior.to(zs[0].device)
+            prior.to(zs[-1].device)
             empty_cache()
 
             # Decode sample
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index c9c484ade524a..9b5ff4a0c0f07 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -73,6 +73,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
             which represent the overall length of the signal,
     """
+    full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = [0] * (max_n_lyric_tokens - len(full_tokens)) + full_tokens
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
@@ -84,8 +85,8 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
         indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
     assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
     assert len(indices) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
-    assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
-    return tokens, indices
+    # assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
+    return tokens.unsqueeze(dim=0), indices
 
 
 class JukeboxTokenizer(PreTrainedTokenizer):
@@ -283,7 +284,7 @@ def prepare_for_tokenization(
                     "_"
                 )  # split is for the full dictionnary with combined genres
 
-        if self.version[idx] == "v3":
+        if self.version[idx] == "v2":
             self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
             vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
@@ -352,14 +353,15 @@ def __call__(self, artist, genres, lyrics, return_tensor="pt"):
 
         attention_masks = [-INFINITY] * len(full_tokens[-1])
         # TODO properly handle the return pt tensor option
+        input_ids = [torch.tensor([input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]])for i in range(len(self.version))]
         if return_tensor == "pt":
-            return [
-                {
-                    "input_ids": input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i],
+            # TODO use BatchEncoding to support
+            
+            return {
+                    "input_ids": input_ids,
                     "attention_masks": torch.tensor(attention_masks),
                 }
-                for i in range(len(self.version))
-            ]
+                
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 3a8530dd3d54d..d0bc228f5fedf 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -34,6 +34,238 @@
     from transformers import JukeboxModel, JukeboxTokenizer  # ,JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST
 
 
+@require_torch
+class Jukebox1bModelTester(unittest.TestCase):
+    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
+    metas = dict(
+        artist="Zac Brown Band",
+        genres="Country",
+        lyrics="""I met a traveller from an antique land,
+    Who said "Two vast and trunkless legs of stone
+    Stand in the desert. . . . Near them, on the sand,
+    Half sunk a shattered visage lies, whose frown,
+    And wrinkled lip, and sneer of cold command,
+    Tell that its sculptor well those passions read
+    Which yet survive, stamped on these lifeless things,
+    The hand that mocked them, and the heart that fed;
+    And on the pedestal, these words appear:
+    My name is Ozymandias, King of Kings;
+    Look on my Works, ye Mighty, and despair!
+    Nothing beside remains. Round the decay
+    Of that colossal Wreck, boundless and bare
+    The lone and level sands stretch far away
+    """,
+    )
+    
+    EXPECTED_OUTPUTS_2 = torch.tensor([1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
+         653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
+         555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
+        1106, 1127, 1494,  812])
+    EXPECTED_OUTPUT_1 = torch.tensor([1125, 1585, 1485, 2020, 1141, 1680,  381,  539, 1368,  642, 1585,  284,
+         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
+         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
+        1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
+        2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
+        1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
+         591, 1803,   58,  591,  529, 1079,  642,  591]
+        )
+    EXPECTED_OUTPUT_0 =  torch.tensor(
+            [1979, 1613,  290, 1843,  844, 1427,  293,  616, 1771,  632,  591,  290,
+         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
+         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
+         854, 1120, 1572,  719]
+        )
+    
+
+
+    def prepare_inputs(self, model, model_id, chunk_size=32):
+        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
+        top_prior = model.priors[-1]
+        # create sampling parameters
+        sampling_temperature = 0.98
+        lower_batch_size = 16
+        max_batch_size = 16
+        sample_length_in_seconds = 24
+        sampling_kwargs = [
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=sampling_temperature,
+                fp16=False,
+                max_batch_size=max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+        ]
+
+        tokens = tokenizer(**self.metas)['input_ids']
+        return tokens, sampling_kwargs
+
+    def test_sampling(self):
+        model_id = "ArthurZ/jukebox-1b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id,cond_res_scale=[None,True, False] ).eval()
+
+        labels, sampling_kwargs = self.prepare_inputs(model, model_id)
+        set_seed(0)
+        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+
+        zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
+        set_seed(0)
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
+        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        assert torch.allclose(zs[-2][0, :40], self.EXPECTED_OUTPUT_1)
+    
+
+        zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
+
+        set_seed(0)
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
+        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
+
+    def test_vqvae(self):
+        # implemented vavae decoding test at 3 levels using the expected outputs
+        pass
+
+    
+
+@require_torch
+class Jukebox5bModelTester(unittest.TestCase):
+    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
+    metas = dict(
+        artist="Zac Brown Band",
+        genres="Country",
+        lyrics="""I met a traveller from an antique land,
+    Who said "Two vast and trunkless legs of stone
+    Stand in the desert. . . . Near them, on the sand,
+    Half sunk a shattered visage lies, whose frown,
+    And wrinkled lip, and sneer of cold command,
+    Tell that its sculptor well those passions read
+    Which yet survive, stamped on these lifeless things,
+    The hand that mocked them, and the heart that fed;
+    And on the pedestal, these words appear:
+    My name is Ozymandias, King of Kings;
+    Look on my Works, ye Mighty, and despair!
+    Nothing beside remains. Round the decay
+    Of that colossal Wreck, boundless and bare
+    The lone and level sands stretch far away
+    """,
+    )
+    
+    EXPECTED_OUTPUT_2 = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653, 1489,  653,
+          653,  653,  653,  653,  653,  653,  653,  653])
+
+    EXPECTED_OUTPUT_1 = torch.tensor([1125,  416, 1125, 1125, 1125, 1125,  416,  416,  416,  416, 1585,  284,
+         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
+         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
+        1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
+        2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
+        1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
+         591, 1803,   58,  591,  529, 1079,  642,  591]
+        )
+    EXPECTED_OUTPUT_0 =  torch.tensor([1755, 1061,  234, 1755,  290, 1572,  234,  491,  992,  417,  591,  290,
+         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
+         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
+         854, 1120, 1572,  719,  491,   34,  755,  632,  844,  755, 1802,  225,
+        2013, 1814, 1148,  616,  185, 1979, 1460,  983, 1168, 1613,   34, 1242,
+         632,   34,   34, 1982, 1510,  554,  983, 1784,  526, 1691, 1268, 1268,
+         290,  755,   34,  307,  222,  234,  648,  526
+         ])
+    
+
+
+    def prepare_inputs(self, model, model_id, chunk_size=32):
+        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
+        top_prior = model.priors[-1]
+        # create sampling parameters
+        sampling_temperature = 0.98
+        lower_batch_size = 16
+        max_batch_size = 16
+        sample_length_in_seconds = 24
+        sampling_kwargs = [
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=sampling_temperature,
+                fp16=False,
+                max_batch_size=max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=10,
+                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+        ]
+
+        tokens = tokenizer(**self.metas)['input_ids']
+        return tokens, sampling_kwargs
+
+    def test_sampling(self):
+        model_id = "ArthurZ/jukebox-5b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id,cond_res_scale=[None,True, False] ).eval()
+
+        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        set_seed(0)
+        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+
+        zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
+        set_seed(0)
+        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
+        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
+    
+
+        zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
+
+        set_seed(0)
+        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
+        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
+
+    def test_vqvae(self):
+        # implemented vavae decoding test at 3 levels using the expected outputs
+        pass
+
+    
+
+
 @require_torch
 class JukeboxModelTest(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
@@ -610,297 +842,11 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
             ),
         ]
 
-        tokens = tokenizer(**self.metas)
-        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        return inputs, sampling_kwargs
-
-    # @slow
-    def test_1b_lyrics(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.enabled = False
-
-        model_id = "ArthurZ/jukebox-1b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval()
-
-        labels, sampling_kwargs = self.prepare_inputs(model, model_id)
-
-        set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
-
-        # TODO generate the original outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1864,
-                1536,
-                1213,
-                1869,
-                1321,
-                1597,
-                519,
-                947,
-                1177,
-                789,
-                1434,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1007,
-                1472,
-                255,
-                1228,
-                555,
-                1272,
-                1379,
-                1423,
-                1673,
-                427,
-                1683,
-                1321,
-                475,
-                416,
-                1177,
-                1827,
-                1106,
-                1127,
-                1494,
-                812,
-            ]
-        )
-        assert torch.allclose(zs[-1][0], EXPECTED_OUTPUT)
-
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1)
-        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
-        # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                904,
-                2037,
-                343,
-                1372,
-                135,
-                717,
-                506,
-                157,
-                307,
-                1419,
-                1751,
-                343,
-                899,
-                1803,
-                573,
-                94,
-                1046,
-                1014,
-                684,
-                869,
-                2037,
-                1125,
-                1004,
-                1658,
-                1181,
-                37,
-                1749,
-                2047,
-                1426,
-                1348,
-                2037,
-                1125,
-                1004,
-                1544,
-                573,
-                885,
-                1749,
-                1803,
-                1426,
-                1348,
-            ]
-        )
-        assert torch.allclose(zs[-2][0, :40], EXPECTED_OUTPUT)
-
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1)
-        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
-        # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                904,
-                2037,
-                343,
-                1372,
-                135,
-                717,
-                506,
-                157,
-                307,
-                1419,
-                1751,
-                343,
-                899,
-                1803,
-                573,
-                94,
-                1046,
-                1014,
-                684,
-                869,
-                2037,
-                1125,
-                1004,
-                1658,
-                1181,
-                37,
-                1749,
-                2047,
-                1426,
-                1348,
-                2037,
-                1125,
-                1004,
-                1544,
-                573,
-                885,
-                1749,
-                1803,
-                1426,
-                1348,
-            ]
-        )
-        assert torch.allclose(zs[0][0, :40], EXPECTED_OUTPUT)
-
-    def test_5b_lyrics(self):
-        set_seed(0)
-        torch.backends.cuda.matmul.allow_tf32 = False
-        torch.backends.cudnn.enabled = False
-
-        model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval()
-
-        labels, sampling_kwargs = self.prepare_inputs(model_id, model.priors[-1].sample_length, chunk_size=16)
-
-        zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(len(model.priors))]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[-1][0, :30], EXPECTED_OUTPUT)
-
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 2048 - zs[-1].shape[-1]).cpu()), dim=-1)
-        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
-        # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[-2][0, :30], EXPECTED_OUTPUT)
-
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 4096 - zs[-2].shape[-1]).cpu()), dim=-1)
-        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
-        # TODO find the expected outputs
-        EXPECTED_OUTPUT = torch.tensor(
-            [
-                1489,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                653,
-                1434,
-                1434,
-                653,
-                1357,
-                653,
-                1434,
-                1434,
-                1536,
-                1599,
-                710,
-            ]
-        )
-        assert torch.allclose(zs[0][0, :30], EXPECTED_OUTPUT)
+        tokens = tokenizer(**self.metas)['input_ids']
+        return tokens, sampling_kwargs
 
 
 if __name__ == "__main__":
-    tester = JukeboxModelTest()
-    tester.test_1b_lyrics()
-    # tester.test_5b_lyrics()
+    tester = Jukebox5bModelTester()
+    # tester.test_1b_lyrics()
+    tester.test_sampling()

From 58cb7bd5219a1199655d23e6a0bc3e93bdf3cb71 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 07:17:43 +0000
Subject: [PATCH 026/196] Add slow GPU tests that needs to be done later on

---
 tests/models/jukebox/test_modeling_jukebox.py | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d0bc228f5fedf..e3bfe302a8b79 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from transformers import JukeboxConfig, is_torch_available
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import require_torch, slow
 from transformers.trainer_utils import set_seed
 
 
@@ -143,6 +143,19 @@ def test_sampling(self):
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
+        
+    @slow
+    def test_slow_sampling(self):
+
+        model_id = "ArthurZ/jukebox-1b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
+
+        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        set_seed(0)
+        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
         pass
@@ -237,7 +250,7 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id,cond_res_scale=[None,True, False] ).eval()
+        model = JukeboxModel.from_pretrained(model_id).eval()
 
         labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
         set_seed(0)
@@ -258,6 +271,18 @@ def test_sampling(self):
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
+    
+    @slow
+    def test_slow_sampling(self):
+
+        model_id = "ArthurZ/jukebox-5b-lyrics"
+        model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
+
+        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        set_seed(0)
+        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
@@ -849,4 +874,4 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
 if __name__ == "__main__":
     tester = Jukebox5bModelTester()
     # tester.test_1b_lyrics()
-    tester.test_sampling()
+    tester.test_slow_sampling()

From 4c90a40cfbbb64df050a3ab906965c126197d847 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 07:18:51 +0000
Subject: [PATCH 027/196] style

---
 .../models/jukebox/configuration_jukebox.py   |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 35 ++++++++++++-------
 .../models/jukebox/tokenization_jukebox.py    | 14 ++++----
 3 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 9932879d23d18..33423f6e934ab 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -198,7 +198,7 @@ def __init__(
         cond_dilation_growth_rate=[1, 3, 3],
         cond_dilation_cycle=[None, 8, 8],
         cond_c_res=[0, 1, 1],
-        cond_res_scale=[None,True, False],
+        cond_res_scale=[None, True, False],
         prime_width=[128, 128, 128],
         prime_depth=[18, 3, 3],
         prime_cond_c_res=[0, 1, 1],
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 869dea69a698c..b9a53eb69e410 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -261,8 +261,20 @@ def __init__(
             blocks.append(block)
             for i in range(down_t):
                 block = nn.Sequential(
-                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out=zero_out, res_scale=res_scale, reverse_dilation=reverse_decoder_dilation, checkpoint_res=checkpoint_res),
-                    nn.ConvTranspose1d(width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t)
+                    Resnet1D(
+                        width,
+                        depth,
+                        m_conv,
+                        dilation_growth_rate,
+                        dilation_cycle,
+                        zero_out=zero_out,
+                        res_scale=res_scale,
+                        reverse_dilation=reverse_decoder_dilation,
+                        checkpoint_res=checkpoint_res,
+                    ),
+                    nn.ConvTranspose1d(
+                        width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
+                    ),
                 )
                 blocks.append(block)
         self.model = nn.Sequential(*blocks)
@@ -2334,7 +2346,7 @@ def postprocess(self, x):
 
     def forward(self, x, x_cond=None):
         if x_cond is None:
-           x_cond = 0.0
+            x_cond = 0.0
         # Embed x
         x = x.long()
         x = self.x_emb(x)
@@ -2705,21 +2717,20 @@ def get_y(self, labels, start, total_length, get_indices=False):
         else:
             return y
 
-
     def set_y_lyric_tokens(self, labels):
         # assert ys.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
-            tokens_list = torch.zeros((1,self.n_tokens),dtype=torch.long)
+            tokens_list = torch.zeros((1, self.n_tokens), dtype=torch.long)
             indices_list = []  # whats the index of each current character in original array
             for i in range(labels.shape[0]):
-                full_tokens = labels.clone()[:,4 + self.y_emb.max_bow_genre_size:]
+                full_tokens = labels.clone()[:, 4 + self.y_emb.max_bow_genre_size :]
                 total_length, offset, duration = labels[i, 0], labels[i, 1], labels[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
-                tokens_list[i,:] = tokens
+                tokens_list[i, :] = tokens
                 indices_list.append(indices)
-                
-            return torch.cat((labels[:, :4 + self.y_emb.max_bow_genre_size],tokens_list),dim=-1), indices_list
+
+            return torch.cat((labels[:, : 4 + self.y_emb.max_bow_genre_size], tokens_list), dim=-1), indices_list
         else:
             return labels, None
 
@@ -3182,7 +3193,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
-        
+
         # the tokenizer, as [total_length, offset, sample_length] can be written on the fly and changed without changing the
         # lyric tokens.
         # get z already sampled at current level
@@ -3247,10 +3258,10 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
 
     # Sample multiple levels
     def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
-        
+
         alignments = None
         for level in reversed(sample_levels):
-            self.total_length = sampling_kwargs[level].pop('total_length') 
+            self.total_length = sampling_kwargs[level].pop("total_length")
             prior = self.priors[level]
             prior = prior.to(zs[0].device).eval()
             empty_cache()
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 9b5ff4a0c0f07..67a343a1f1298 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -353,15 +353,17 @@ def __call__(self, artist, genres, lyrics, return_tensor="pt"):
 
         attention_masks = [-INFINITY] * len(full_tokens[-1])
         # TODO properly handle the return pt tensor option
-        input_ids = [torch.tensor([input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]])for i in range(len(self.version))]
+        input_ids = [
+            torch.tensor([input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]])
+            for i in range(len(self.version))
+        ]
         if return_tensor == "pt":
             # TODO use BatchEncoding to support
-            
+
             return {
-                    "input_ids": input_ids,
-                    "attention_masks": torch.tensor(attention_masks),
-                }
-                
+                "input_ids": input_ids,
+                "attention_masks": torch.tensor(attention_masks),
+            }
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """

From 3c90c363acd3d8b7383da253db0e4b7fc4b68053 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 07:27:23 +0000
Subject: [PATCH 028/196] style

---
 tests/models/jukebox/test_modeling_jukebox.py | 97 ++++++++++---------
 1 file changed, 51 insertions(+), 46 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index e3bfe302a8b79..d993409267e5e 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -56,27 +56,32 @@ class Jukebox1bModelTester(unittest.TestCase):
     The lone and level sands stretch far away
     """,
     )
-    
-    EXPECTED_OUTPUTS_2 = torch.tensor([1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
-         653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
-         555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
-        1106, 1127, 1494,  812])
-    EXPECTED_OUTPUT_1 = torch.tensor([1125, 1585, 1485, 2020, 1141, 1680,  381,  539, 1368,  642, 1585,  284,
-         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
-         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
+    # fmt: off
+    EXPECTED_OUTPUTS_2 = torch.tensor([
+        1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
+        653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
+        555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
+        1106, 1127, 1494,  812
+        ]
+    )
+    EXPECTED_OUTPUT_1 = torch.tensor([
+        1125, 1585, 1485, 2020, 1141, 1680,  381,  539, 1368,  642, 1585,  284,
+        717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
+        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
         1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
         2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
         1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
-         591, 1803,   58,  591,  529, 1079,  642,  591]
-        )
-    EXPECTED_OUTPUT_0 =  torch.tensor(
-            [1979, 1613,  290, 1843,  844, 1427,  293,  616, 1771,  632,  591,  290,
-         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
-         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
-         854, 1120, 1572,  719]
-        )
-    
-
+        591, 1803,   58,  591,  529, 1079,  642,  591
+        ]
+    )
+    EXPECTED_OUTPUT_0 =  torch.tensor([
+        1979, 1613,  290, 1843,  844, 1427,  293,  616, 1771,  632,  591,  290,
+        234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
+        290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
+        854, 1120, 1572,  719
+        ]
+    )
+    # fmt: on
 
     def prepare_inputs(self, model, model_id, chunk_size=32):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
@@ -116,12 +121,12 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
             ),
         ]
 
-        tokens = tokenizer(**self.metas)['input_ids']
+        tokens = tokenizer(**self.metas)["input_ids"]
         return tokens, sampling_kwargs
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id,cond_res_scale=[None,True, False] ).eval()
+        model = JukeboxModel.from_pretrained(model_id, cond_res_scale=[None, True, False]).eval()
 
         labels, sampling_kwargs = self.prepare_inputs(model, model_id)
         set_seed(0)
@@ -134,7 +139,6 @@ def test_sampling(self):
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         assert torch.allclose(zs[-2][0, :40], self.EXPECTED_OUTPUT_1)
-    
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
@@ -143,7 +147,6 @@ def test_sampling(self):
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
-        
     @slow
     def test_slow_sampling(self):
 
@@ -160,7 +163,6 @@ def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
         pass
 
-    
 
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
@@ -184,28 +186,34 @@ class Jukebox5bModelTester(unittest.TestCase):
     The lone and level sands stretch far away
     """,
     )
-    
-    EXPECTED_OUTPUT_2 = torch.tensor([1489,  653,  653,  653,  653,  653,  653,  653,  653,  653, 1489,  653,
-          653,  653,  653,  653,  653,  653,  653,  653])
 
-    EXPECTED_OUTPUT_1 = torch.tensor([1125,  416, 1125, 1125, 1125, 1125,  416,  416,  416,  416, 1585,  284,
-         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
-         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
+    # fmt: off
+    EXPECTED_OUTPUT_2 = torch.tensor([
+        1489,  653,  653,  653,  653,  653,  653,  653,  653,  653, 1489,  653,
+        653,  653,  653,  653,  653,  653,  653,  653
+        ]
+    )
+    EXPECTED_OUTPUT_1 = torch.tensor([
+        1125,  416, 1125, 1125, 1125, 1125,  416,  416,  416,  416, 1585,  284,
+        717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
+        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
         1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
         2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
         1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
-         591, 1803,   58,  591,  529, 1079,  642,  591]
-        )
-    EXPECTED_OUTPUT_0 =  torch.tensor([1755, 1061,  234, 1755,  290, 1572,  234,  491,  992,  417,  591,  290,
-         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
-         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
-         854, 1120, 1572,  719,  491,   34,  755,  632,  844,  755, 1802,  225,
+        591, 1803,   58,  591,  529, 1079,  642,  591
+        ]
+    )
+    EXPECTED_OUTPUT_0 =  torch.tensor([
+        1755, 1061,  234, 1755,  290, 1572,  234,  491,  992,  417,  591,  290,
+        234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
+        290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
+        854, 1120, 1572,  719,  491,   34,  755,  632,  844,  755, 1802,  225,
         2013, 1814, 1148,  616,  185, 1979, 1460,  983, 1168, 1613,   34, 1242,
-         632,   34,   34, 1982, 1510,  554,  983, 1784,  526, 1691, 1268, 1268,
-         290,  755,   34,  307,  222,  234,  648,  526
-         ])
-    
-
+        632,   34,   34, 1982, 1510,  554,  983, 1784,  526, 1691, 1268, 1268,
+        290,  755,   34,  307,  222,  234,  648,  526
+        ]
+    )
+    # fmt: on
 
     def prepare_inputs(self, model, model_id, chunk_size=32):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
@@ -245,7 +253,7 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
             ),
         ]
 
-        tokens = tokenizer(**self.metas)['input_ids']
+        tokens = tokenizer(**self.metas)["input_ids"]
         return tokens, sampling_kwargs
 
     def test_sampling(self):
@@ -263,7 +271,6 @@ def test_sampling(self):
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
         assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
-    
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
@@ -271,7 +278,7 @@ def test_sampling(self):
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
         assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
-    
+
     @slow
     def test_slow_sampling(self):
 
@@ -288,8 +295,6 @@ def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
         pass
 
-    
-
 
 @require_torch
 class JukeboxModelTest(unittest.TestCase):
@@ -867,7 +872,7 @@ def prepare_inputs(self, model, model_id, chunk_size=32):
             ),
         ]
 
-        tokens = tokenizer(**self.metas)['input_ids']
+        tokens = tokenizer(**self.metas)["input_ids"]
         return tokens, sampling_kwargs
 
 

From b5a0a2e0b415cf5c86ad7b1ad85a02b298a0f218 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 07:30:08 +0000
Subject: [PATCH 029/196] style and change

---
 .../models/jukebox/modeling_jukebox.py         | 10 ++++------
 tests/models/jukebox/test_modeling_jukebox.py  | 18 ++++++++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index b9a53eb69e410..0298426dea2e6 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -33,6 +33,9 @@
     is_amp_available = False
 
 import gc
+import sys
+
+from tqdm import tqdm
 
 from ...activations import ACT2FN
 from ...modeling_utils import PreTrainedModel
@@ -59,11 +62,6 @@ def empty_cache():
     torch.cuda.empty_cache()
 
 
-import sys
-
-from tqdm import tqdm
-
-
 def get_range(x):
     return tqdm(
         x, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
@@ -439,7 +437,7 @@ def init_k(self, x):
         self.k_elem = torch.ones(k_bins, device=self.k.device)
 
     def restore_k(self, num_tokens=None, threshold=1.0):
-        emb_width, k_bins = self.emb_width, self.k_bins  # mu -> _
+        k_bins = self.k_bins  # mu -> _
         self.init = True
         self.k_sum = self.k.clone()
         self.k_elem = torch.ones(k_bins, device=self.k.device)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d993409267e5e..12226caac96b1 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -57,14 +57,16 @@ class Jukebox1bModelTester(unittest.TestCase):
     """,
     )
     # fmt: off
-    EXPECTED_OUTPUTS_2 = torch.tensor([
+    EXPECTED_OUTPUTS_2 = torch.tensor(
+        [
         1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
         653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
         555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
         1106, 1127, 1494,  812
         ]
     )
-    EXPECTED_OUTPUT_1 = torch.tensor([
+    EXPECTED_OUTPUT_1 = torch.tensor(
+        [
         1125, 1585, 1485, 2020, 1141, 1680,  381,  539, 1368,  642, 1585,  284,
         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
@@ -74,7 +76,8 @@ class Jukebox1bModelTester(unittest.TestCase):
         591, 1803,   58,  591,  529, 1079,  642,  591
         ]
     )
-    EXPECTED_OUTPUT_0 =  torch.tensor([
+    EXPECTED_OUTPUT_0 =  torch.tensor(
+        [
         1979, 1613,  290, 1843,  844, 1427,  293,  616, 1771,  632,  591,  290,
         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
@@ -188,12 +191,14 @@ class Jukebox5bModelTester(unittest.TestCase):
     )
 
     # fmt: off
-    EXPECTED_OUTPUT_2 = torch.tensor([
+    EXPECTED_OUTPUT_2 = torch.tensor(
+        [
         1489,  653,  653,  653,  653,  653,  653,  653,  653,  653, 1489,  653,
         653,  653,  653,  653,  653,  653,  653,  653
         ]
     )
-    EXPECTED_OUTPUT_1 = torch.tensor([
+    EXPECTED_OUTPUT_1 = torch.tensor(
+        [
         1125,  416, 1125, 1125, 1125, 1125,  416,  416,  416,  416, 1585,  284,
         717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
         593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
@@ -203,7 +208,8 @@ class Jukebox5bModelTester(unittest.TestCase):
         591, 1803,   58,  591,  529, 1079,  642,  591
         ]
     )
-    EXPECTED_OUTPUT_0 =  torch.tensor([
+    EXPECTED_OUTPUT_0 =  torch.tensor(
+        [
         1755, 1061,  234, 1755,  290, 1572,  234,  491,  992,  417,  591,  290,
         234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
         290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,

From e7155ce57a167be3d7d48fa5c62f17618a82439a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur@huggingface.co>
Date: Fri, 8 Jul 2022 07:34:56 +0000
Subject: [PATCH 030/196] quality check

---
 tests/models/jukebox/test_modeling_jukebox.py | 66 +++++++++----------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 12226caac96b1..626a8a0225338 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -59,29 +59,29 @@ class Jukebox1bModelTester(unittest.TestCase):
     # fmt: off
     EXPECTED_OUTPUTS_2 = torch.tensor(
         [
-        1864, 1536, 1213, 1869, 1321, 1597,  519,  947, 1177,  789, 1434,  653,
-        653,  653,  653,  653,  653,  653,  653,  653, 1007, 1472,  255, 1228,
-        555, 1272, 1379, 1423, 1673,  427, 1683, 1321,  475,  416, 1177, 1827,
-        1106, 1127, 1494,  812
+            1864, 1536, 1213, 1869, 1321, 1597, 519, 947, 1177, 789, 1434, 653,
+            653, 653, 653, 653, 653, 653, 653, 653, 1007, 1472, 255, 1228,
+            555, 1272, 1379, 1423, 1673, 427, 1683, 1321, 475, 416, 1177, 1827,
+            1106, 1127, 1494, 812
         ]
     )
     EXPECTED_OUTPUT_1 = torch.tensor(
         [
-        1125, 1585, 1485, 2020, 1141, 1680,  381,  539, 1368,  642, 1585,  284,
-        717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
-        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
-        1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
-        2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
-        1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
-        591, 1803,   58,  591,  529, 1079,  642,  591
+            1125, 1585, 1485, 2020, 1141, 1680, 381, 539, 1368, 642, 1585, 284,
+            717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
+            593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
+            1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
+            2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
+            1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
+            591, 1803, 58, 591, 529, 1079, 642, 591
         ]
     )
-    EXPECTED_OUTPUT_0 =  torch.tensor(
+    EXPECTED_OUTPUT_0 = torch.tensor(
         [
-        1979, 1613,  290, 1843,  844, 1427,  293,  616, 1771,  632,  591,  290,
-        234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
-        290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
-        854, 1120, 1572,  719
+            1979, 1613, 290, 1843, 844, 1427, 293, 616, 1771, 632, 591, 290,
+            234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
+            290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
+            854, 1120, 1572, 719
         ]
     )
     # fmt: on
@@ -193,30 +193,30 @@ class Jukebox5bModelTester(unittest.TestCase):
     # fmt: off
     EXPECTED_OUTPUT_2 = torch.tensor(
         [
-        1489,  653,  653,  653,  653,  653,  653,  653,  653,  653, 1489,  653,
-        653,  653,  653,  653,  653,  653,  653,  653
+            1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 1489, 653,
+            653, 653, 653, 653, 653, 653, 653, 653
         ]
     )
     EXPECTED_OUTPUT_1 = torch.tensor(
         [
-        1125,  416, 1125, 1125, 1125, 1125,  416,  416,  416,  416, 1585,  284,
-        717, 1544, 1045, 1320,  711,  193, 1440, 1193,  416, 1125,  539, 1544,
-        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125,  556, 1014, 1045, 1858,
-        1749, 1803, 1440, 1145,  416,  416, 1372, 1079, 1045, 1320, 1764,  158,
-        2020, 1543, 2037,  416,  539, 2047, 1446,  885, 1749, 2047,  118, 1348,
-        1585,  284,  529, 2047, 1228,  556,  732, 2047,  307, 1323, 2037, 1446,
-        591, 1803,   58,  591,  529, 1079,  642,  591
+            1125, 416, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 1585, 284,
+            717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
+            593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
+            1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
+            2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
+            1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
+            591, 1803, 58, 591, 529, 1079, 642, 591
         ]
     )
-    EXPECTED_OUTPUT_0 =  torch.tensor(
+    EXPECTED_OUTPUT_0 = torch.tensor(
         [
-        1755, 1061,  234, 1755,  290, 1572,  234,  491,  992,  417,  591,  290,
-        234,  842,  589,  948,  983,  616, 1613, 1613,  290,  632,   89,  632,
-        290, 1022,  983, 1612, 1353,  581, 1353,  755,  185,  307,  632, 1979,
-        854, 1120, 1572,  719,  491,   34,  755,  632,  844,  755, 1802,  225,
-        2013, 1814, 1148,  616,  185, 1979, 1460,  983, 1168, 1613,   34, 1242,
-        632,   34,   34, 1982, 1510,  554,  983, 1784,  526, 1691, 1268, 1268,
-        290,  755,   34,  307,  222,  234,  648,  526
+            1755, 1061, 234, 1755, 290, 1572, 234, 491, 992, 417, 591, 290,
+            234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
+            290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
+            854, 1120, 1572, 719, 491, 34, 755, 632, 844, 755, 1802, 225,
+            2013, 1814, 1148, 616, 185, 1979, 1460, 983, 1168, 1613, 34, 1242,
+            632, 34, 34, 1982, 1510, 554, 983, 1784, 526, 1691, 1268, 1268,
+            290, 755, 34, 307, 222, 234, 648, 526
         ]
     )
     # fmt: on

From 0c66dd109030bbef114bb4e00323ccacf8205186 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Tue, 19 Jul 2022 08:50:10 +0200
Subject: [PATCH 031/196] simplify music generation

---
 .../models/jukebox/modeling_jukebox.py        |  65 +-
 tests/models/jukebox/test_modeling_jukebox.py | 674 ++----------------
 2 files changed, 123 insertions(+), 616 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 0298426dea2e6..85ad7c7dce2eb 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3255,9 +3255,50 @@ def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_len
         return zs
 
     # Sample multiple levels
-    def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
-
-        alignments = None
+    def _sample(
+        self,
+        zs,
+        labels,
+        sample_levels,
+        chunk_size=32,
+        sampling_temperature=0.98,
+        lower_batch_size=16,
+        max_batch_size=16,
+        sample_length_in_seconds=24,
+        alignments=None,
+        sample_tokens=None,
+    ):
+        top_prior = self.priors[-1]
+        sampling_kwargs = [
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=sample_tokens,
+                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=0.99,
+                fp16=False,
+                max_batch_size=lower_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=sample_tokens,
+                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+            dict(
+                temp=sampling_temperature,
+                fp16=False,
+                max_batch_size=max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=sample_tokens,
+                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
+                * top_prior.raw_to_tokens,
+            ),
+        ]
+        hps = self.config
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
             prior = self.priors[level]
@@ -3294,28 +3335,28 @@ def _sample(self, zs, labels, sampling_kwargs, sample_levels, hps):
         return zs
 
     # Generate ancestral samples given a list of artists and genres
-    def ancestral_sample(self, labels, sampling_kwargs, hps):
+    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
         priors = self.priors
         sample_levels = list(range(len(priors)))
-        zs = [torch.zeros(hps.n_samples, 0, dtype=torch.long, device=self.device) for _ in range(len(priors))]
-        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=self.device) for _ in range(len(priors))]
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Continue ancestral sampling from previously saved codes
-    def continue_sample(self, zs, labels, sampling_kwargs, hps):
+    def continue_sample(self, zs, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Upsample given already generated upper-level codes
-    def upsample(self, zs, labels, sampling_kwargs, hps):
+    def upsample(self, zs, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors) - 1))
-        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
-    def primed_sample(self, x, labels, sampling_kwargs, hps):
+    def primed_sample(self, x, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
         zs = self.priors[-1].encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
-        zs = self._sample(zs, labels, sampling_kwargs, sample_levels, hps)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 626a8a0225338..2fabb61d3fd9d 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -86,68 +86,32 @@ class Jukebox1bModelTester(unittest.TestCase):
     )
     # fmt: on
 
-    def prepare_inputs(self, model, model_id, chunk_size=32):
+    def prepare_inputs(self, model_id):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-        top_prior = model.priors[-1]
-        # create sampling parameters
-        sampling_temperature = 0.98
-        lower_batch_size = 16
-        max_batch_size = 16
-        sample_length_in_seconds = 24
-        sampling_kwargs = [
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=sampling_temperature,
-                fp16=False,
-                max_batch_size=max_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-        ]
-
         tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens, sampling_kwargs
+        return tokens
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id, cond_res_scale=[None, True, False]).eval()
 
-        labels, sampling_kwargs = self.prepare_inputs(model, model_id)
+        labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        zs = model._sample(zs, labels, [1], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-2][0, :40], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        zs = model._sample(zs, labels, [0], model.config, sample_tokens = 10)
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
     @slow
@@ -156,10 +120,10 @@ def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
@@ -221,68 +185,32 @@ class Jukebox5bModelTester(unittest.TestCase):
     )
     # fmt: on
 
-    def prepare_inputs(self, model, model_id, chunk_size=32):
+    def prepare_inputs(self, model_id):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-        top_prior = model.priors[-1]
-        # create sampling parameters
-        sampling_temperature = 0.98
-        lower_batch_size = 16
-        max_batch_size = 16
-        sample_length_in_seconds = 24
-        sampling_kwargs = [
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=sampling_temperature,
-                fp16=False,
-                max_batch_size=max_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-        ]
-
         tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens, sampling_kwargs
+        return tokens
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval()
 
-        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, sampling_kwargs, [1], model.config)
+        zs = model._sample(zs, labels, [1], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, sampling_kwargs, [0], model.config)
+        zs = model._sample(zs, labels, [0], model.config, sample_tokens = 10)
         assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
 
     @slow
@@ -291,10 +219,10 @@ def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels, sampling_kwargs = self.prepare_inputs(model, model_id, chunk_size=32)
+        labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, sampling_kwargs, [2], model.config)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
@@ -303,7 +231,7 @@ def test_vqvae(self):
 
 
 @require_torch
-class JukeboxModelTest(unittest.TestCase):
+class JukeboxDummyModelTest(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
 
     metas = dict(
@@ -325,7 +253,46 @@ class JukeboxModelTest(unittest.TestCase):
     The lone and level sands stretch far away
     """,
     )
-
+    # fmt : off
+    top_50_expected_zs = torch.tensor(
+        [ 33,  90,  94,  17,  88,  88,  31,  65, 127, 112,  26,  58, 107,   5,
+        89,  53,  80,  48,  98,  68,   1,  33,  80,  80, 126,   2,  53,   8,
+        16,  45,  35,  64,  75,  10,  16,  11,  65,  39,  85,  17, 112,  44,
+        68,  63,  16, 127,  35,  90,  51,  27
+        ]
+    )
+    expected_samples = torch.Tensor([
+        [
+            121,  67,  16, 111,  54,  84,   0,   0,  41,   0,  14,   0,   0,  49,
+            20,  12,   5,   0,  58,  83,   0,  61,   0,  29,   0,  36,  42,  62,
+            75,   0,  88,  51,   0,   0,  20, 110,  39,  20,  85,   0,   0,   0,
+            76,   0,  32,  17,  99,   0, 127, 103,  78,   0,   0, 125,  82,   0,
+            38,  74,   0,  41,  38,   0,   0, 127,  45,   0,   2,  99,   0,  88,
+            84,  86,   5,  70,   0,   0,   0,   0,  23,   0,   0,   5,   0,   0,
+            3,  28,  47,   1,  32,   0,   9,  98, 111,   0,  66,   0,   0,   0,
+            59,  48,   0, 123,  61,  37,  13, 121,  24, 122, 101,   0,  68,  13,
+            31,   0,  57,   0,  24,  13,  85,   0,   0,  68,   0, 105,   0, 105,
+            0,  50,   0,   0,  64,   0,  14, 103,   0,   0,   0,  77,  26,  33,
+            0,  79,  55,  57,   0,  37,   0,   0,  79,  53,   0, 111,  83,  58,
+            41,  70,   1,  28, 109,  56,   0,  98,  80,   0, 100,  62, 126,   0,
+            0,  23,   0,   0,  43, 114,  23,  44,   0,  68,  53,   0,   0,  84,
+            0,   0,   0,   4, 123,   0,   0,  99,  36,  78,   0,   0,  45,  16,
+            75, 111,  95,  62,  36,   0,  52,  92,  33,  71,   3,   0, 110,   0,
+            0,   0, 124,   0,   0,   0,   2,   0, 101, 125,   0,   0,   0,   3,
+            0,   0, 123,   0,   0,  85,   0,  99,   0,  36, 107,  77,   0,   4,
+            41,  73,   0,  66,  43,  19,   0,   0, 124,   0,  55,  32,   0,   0,
+            0,   0,  90,  96
+        ]]
+    )
+    top_50_expected_zs = torch.tensor(
+        [ 33,  90,  94,  17,  88,  88,  31,  65, 127, 112,  26,  58, 107,   5,
+        89,  53,  80,  48,  98,  68,   1,  33,  80,  80, 126,   2,  53,   8,
+        16,  45,  35,  64,  75,  10,  16,  11,  65,  39,  85,  17, 112,  44,
+        68,  63,  16, 127,  35,  90,  51,  27
+        ]
+    )
+    # fmt : on
+    
     # @slow
 
     def test_model(self):
@@ -355,534 +322,33 @@ def test_model(self):
 
         model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy").eval()
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
+        tokens = tokenizer(
+            "Alan Jackson",
+            "rock",
+            "old town road",
+            total_length=config.sample_length_in_seconds * config.sr,
+        )
 
         # Checks
-
-        import random
-
-        seed = 0
-        random.seed(seed)
-        np.random.seed(seed)
-
-        torch.manual_seed(seed)
-        torch.cuda.manual_seed_all(seed)
+        set_seed(0)
 
         sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
-
-        expected_samples = torch.Tensor(
-            [
-                [
-                    121,
-                    67,
-                    16,
-                    111,
-                    54,
-                    84,
-                    0,
-                    0,
-                    41,
-                    0,
-                    14,
-                    0,
-                    0,
-                    49,
-                    20,
-                    12,
-                    5,
-                    0,
-                    58,
-                    83,
-                    0,
-                    61,
-                    0,
-                    29,
-                    0,
-                    36,
-                    42,
-                    62,
-                    75,
-                    0,
-                    88,
-                    51,
-                    0,
-                    0,
-                    20,
-                    110,
-                    39,
-                    20,
-                    85,
-                    0,
-                    0,
-                    0,
-                    76,
-                    0,
-                    32,
-                    17,
-                    99,
-                    0,
-                    127,
-                    103,
-                    78,
-                    0,
-                    0,
-                    125,
-                    82,
-                    0,
-                    38,
-                    74,
-                    0,
-                    41,
-                    38,
-                    0,
-                    0,
-                    127,
-                    45,
-                    0,
-                    2,
-                    99,
-                    0,
-                    88,
-                    84,
-                    86,
-                    5,
-                    70,
-                    0,
-                    0,
-                    0,
-                    0,
-                    23,
-                    0,
-                    0,
-                    5,
-                    0,
-                    0,
-                    3,
-                    28,
-                    47,
-                    1,
-                    32,
-                    0,
-                    9,
-                    98,
-                    111,
-                    0,
-                    66,
-                    0,
-                    0,
-                    0,
-                    59,
-                    48,
-                    0,
-                    123,
-                    61,
-                    37,
-                    13,
-                    121,
-                    24,
-                    122,
-                    101,
-                    0,
-                    68,
-                    13,
-                    31,
-                    0,
-                    57,
-                    0,
-                    24,
-                    13,
-                    85,
-                    0,
-                    0,
-                    68,
-                    0,
-                    105,
-                    0,
-                    105,
-                    0,
-                    50,
-                    0,
-                    0,
-                    64,
-                    0,
-                    14,
-                    103,
-                    0,
-                    0,
-                    0,
-                    77,
-                    26,
-                    33,
-                    0,
-                    79,
-                    55,
-                    57,
-                    0,
-                    37,
-                    0,
-                    0,
-                    79,
-                    53,
-                    0,
-                    111,
-                    83,
-                    58,
-                    41,
-                    70,
-                    1,
-                    28,
-                    109,
-                    56,
-                    0,
-                    98,
-                    80,
-                    0,
-                    100,
-                    62,
-                    126,
-                    0,
-                    0,
-                    23,
-                    0,
-                    0,
-                    43,
-                    114,
-                    23,
-                    44,
-                    0,
-                    68,
-                    53,
-                    0,
-                    0,
-                    84,
-                    0,
-                    0,
-                    0,
-                    4,
-                    123,
-                    0,
-                    0,
-                    99,
-                    36,
-                    78,
-                    0,
-                    0,
-                    45,
-                    16,
-                    75,
-                    111,
-                    95,
-                    62,
-                    36,
-                    0,
-                    52,
-                    92,
-                    33,
-                    71,
-                    3,
-                    0,
-                    110,
-                    0,
-                    0,
-                    0,
-                    124,
-                    0,
-                    0,
-                    0,
-                    2,
-                    0,
-                    101,
-                    125,
-                    0,
-                    0,
-                    0,
-                    3,
-                    0,
-                    0,
-                    123,
-                    0,
-                    0,
-                    85,
-                    0,
-                    99,
-                    0,
-                    36,
-                    107,
-                    77,
-                    0,
-                    4,
-                    41,
-                    73,
-                    0,
-                    66,
-                    43,
-                    19,
-                    0,
-                    0,
-                    124,
-                    0,
-                    55,
-                    32,
-                    0,
-                    0,
-                    0,
-                    0,
-                    90,
-                    96,
-                ]
-            ]
-        )
-
-        self.assertTrue(np.allclose(sample, expected_samples))
+        self.assertTrue(np.allclose(sample, self.expected_samples))
 
         with torch.no_grad():
             x = model.vqvae.decode([sample], start_level=1, end_level=2, bs_chunks=sample.shape[0])
-
-        expected_x = torch.Tensor(
-            [
-                0.0595,
-                0.0952,
-                0.0354,
-                0.1182,
-                0.0312,
-                0.1063,
-                0.0306,
-                0.1336,
-                0.0369,
-                0.0902,
-                0.0332,
-                0.1230,
-                0.0322,
-                0.1036,
-                0.0332,
-                0.1352,
-                0.0382,
-                0.0941,
-                0.0302,
-                0.1226,
-                0.0313,
-                0.1077,
-                0.0316,
-                0.1375,
-                0.0392,
-                0.0961,
-                0.0303,
-                0.1233,
-                0.0342,
-                0.1067,
-                0.0334,
-                0.1359,
-                0.0404,
-                0.0963,
-                0.0309,
-                0.1218,
-                0.0319,
-                0.1069,
-                0.0323,
-                0.1373,
-                0.0398,
-                0.0952,
-                0.0310,
-                0.1237,
-                0.0348,
-                0.1058,
-                0.0336,
-                0.1370,
-                0.0410,
-                0.0954,
-                0.0306,
-                0.1224,
-                0.0331,
-                0.1081,
-                0.0323,
-                0.1365,
-                0.0410,
-                0.0982,
-                0.0331,
-                0.1223,
-                0.0368,
-                0.1070,
-                0.0338,
-                0.1359,
-                0.0416,
-                0.0976,
-                0.0328,
-                0.1214,
-                0.0346,
-                0.1087,
-                0.0328,
-                0.1364,
-                0.0393,
-                0.0973,
-                0.0333,
-                0.1236,
-                0.0361,
-                0.1074,
-                0.0337,
-                0.1361,
-                0.0409,
-                0.0967,
-                0.0322,
-                0.1222,
-                0.0342,
-                0.1090,
-                0.0320,
-                0.1374,
-                0.0398,
-                0.0985,
-                0.0331,
-                0.1231,
-                0.0362,
-                0.1074,
-                0.0335,
-                0.1360,
-                0.0410,
-                0.0971,
-                0.0325,
-                0.1220,
-            ]
-        )
-
         first_100 = x.squeeze(-1)[0][0:100]
-        self.assertTrue(torch.allclose(first_100, expected_x, atol=1e-4))
-
-        sampling_temperature = 0.98
-        lower_batch_size = 16
-        max_batch_size = 16
-        lower_level_chunk_size = 32
-        chunk_size = 32
-        sampling_kwargs = [
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=0.99, fp16=False, max_batch_size=lower_batch_size, chunk_size=lower_level_chunk_size),
-            dict(temp=sampling_temperature, fp16=False, max_batch_size=max_batch_size, chunk_size=chunk_size),
-        ]
-        config.hop_fraction = [0.125, 0.5, 0.5]
-        config.n_samples = 1
+        self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
 
-        tokens = tokenizer(
-            "Alan Jackson",
-            "rock",
-            "old town road",
-            total_length=config.sample_length_in_seconds * config.sr,
-            sample_length=32768,
-            offset=0,
-            duration=1,
-        )
 
+        model.config.hop_fraction = [0.125, 0.5, 0.5]
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-
-        ys = np.array([[inputs]] * 3, dtype=np.int64)
-        ys = torch.stack([torch.from_numpy(y) for y in ys], dim=0).to("cpu").long()
-
         start = timeit.default_timer()
-        zs = model.ancestral_sample(ys, sampling_kwargs, config)
+        zs = model.ancestral_sample(inputs, chunk_size = 32)
         print(f"time to sample : {timeit.default_timer() - start}")
-        print(zs)
-        top_50_expected_zs = torch.Tensor(
-            [
-                33,
-                90,
-                94,
-                17,
-                88,
-                88,
-                31,
-                65,
-                127,
-                112,
-                26,
-                58,
-                107,
-                5,
-                89,
-                53,
-                80,
-                48,
-                98,
-                68,
-                1,
-                33,
-                80,
-                80,
-                126,
-                2,
-                53,
-                8,
-                16,
-                45,
-                35,
-                64,
-                75,
-                10,
-                16,
-                11,
-                65,
-                39,
-                85,
-                17,
-                112,
-                44,
-                68,
-                63,
-                16,
-                127,
-                35,
-                90,
-                51,
-                27,
-            ]
-        )
-
-        self.assertTrue(torch.allclose(zs[0][0][0:50], top_50_expected_zs.long(), atol=1e-4))
-
-    def test_conditioning(self):
-        pass
-        # x,x_conds and y_conds should be the same  before calling the sampling
-        # start and end embeding
-        # expected conditioning to match
-
-    def prepare_inputs(self, model, model_id, chunk_size=32):
-        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
-        top_prior = model.priors[-1]
-        # create sampling parameters
-        sampling_temperature = 0.98
-        lower_batch_size = 16
-        max_batch_size = 16
-        sample_length_in_seconds = 24
-        sampling_kwargs = [
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=0.99,
-                fp16=False,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-            dict(
-                temp=sampling_temperature,
-                fp16=False,
-                max_batch_size=max_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=10,
-                total_length=(int(sample_length_in_seconds * model.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-            ),
-        ]
-
-        tokens = tokenizer(**self.metas)["input_ids"]
-        return tokens, sampling_kwargs
-
+        self.assertTrue(torch.allclose(zs[0][0][0:50], self.top_50_expected_zs.long(), atol=1e-4))
 
 if __name__ == "__main__":
     tester = Jukebox5bModelTester()
-    # tester.test_1b_lyrics()
+    tester.test_1b_lyrics()
     tester.test_slow_sampling()

From 30b20847eab22d35ac0cd525a7d55f1849004e78 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Tue, 19 Jul 2022 08:59:56 +0200
Subject: [PATCH 032/196] clean tests

---
 tests/models/jukebox/test_modeling_jukebox.py | 27 +------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 2fabb61d3fd9d..056fa18ecf2a4 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -297,38 +297,14 @@ class JukeboxDummyModelTest(unittest.TestCase):
 
     def test_model(self):
         set_seed(0)
-
-        config = JukeboxConfig(
-            n_ctx=(256, 256, 256),
-            width=[128, 64, 32],
-            depth=[2, 2, 2],
-            priors_width=[128, 64, 32],
-            cond_width=[128, 128, 64],
-            l_bins=128,
-            vq_vae_codebook_dimension=128,
-            vq_vae_emmbedding_width=128,
-            sr=44100,
-            attn_order=[12, 2, 2],
-            n_heads=[2, 1, 1],
-            t_bins=64,
-            single_enc_dec=[True, False, False],
-            labels=True,
-            n_vocab=79,
-            sample_length=44032
-            # allows the use of label conditionning. Has to be
-            # True if the single_enc_dec is set to true apparently
-            # ntokens also have to be set to the nb of lyric tokens
-        )
-
         model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy").eval()
         tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
         tokens = tokenizer(
             "Alan Jackson",
             "rock",
             "old town road",
-            total_length=config.sample_length_in_seconds * config.sr,
+            total_length=model.config.sample_length_in_seconds * model.config.sr,
         )
-
         # Checks
         set_seed(0)
 
@@ -341,7 +317,6 @@ def test_model(self):
         self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
 
 
-        model.config.hop_fraction = [0.125, 0.5, 0.5]
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
         start = timeit.default_timer()
         zs = model.ancestral_sample(inputs, chunk_size = 32)

From 33aba3b2252567ade138077935d52a02dfa47069 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Tue, 19 Jul 2022 11:21:26 +0200
Subject: [PATCH 033/196] style

---
 tests/models/jukebox/test_modeling_jukebox.py | 102 ++++++++----------
 1 file changed, 47 insertions(+), 55 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 056fa18ecf2a4..b4a01e77717a4 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -17,17 +17,11 @@
 
 import numpy as np
 
-from transformers import JukeboxConfig, is_torch_available
+from transformers import is_torch_available
 from transformers.testing_utils import require_torch, slow
 from transformers.trainer_utils import set_seed
 
 
-# from datasets import load_dataset
-
-
-# from transformers.testing_utils import require_torch, slow, torch_device
-
-
 if is_torch_available():
     import torch
 
@@ -98,20 +92,20 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [1], model.config, sample_tokens=10)
         assert torch.allclose(zs[-2][0, :40], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [0], model.config, sample_tokens=10)
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
     @slow
@@ -123,7 +117,7 @@ def test_slow_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
@@ -197,36 +191,35 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [1], model.config, sample_tokens=10)
         assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [0], model.config, sample_tokens=10)
         assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
 
     @slow
     def test_slow_sampling(self):
-
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens = 10)
+        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
-        # implemented vavae decoding test at 3 levels using the expected outputs
+        # implement vavae decoding test at 3 levels using the expected outputs
         pass
 
 
@@ -253,47 +246,49 @@ class JukeboxDummyModelTest(unittest.TestCase):
     The lone and level sands stretch far away
     """,
     )
-    # fmt : off
+    # fmt: off
     top_50_expected_zs = torch.tensor(
-        [ 33,  90,  94,  17,  88,  88,  31,  65, 127, 112,  26,  58, 107,   5,
-        89,  53,  80,  48,  98,  68,   1,  33,  80,  80, 126,   2,  53,   8,
-        16,  45,  35,  64,  75,  10,  16,  11,  65,  39,  85,  17, 112,  44,
-        68,  63,  16, 127,  35,  90,  51,  27
+        [
+            33, 90, 94, 17, 88, 88, 31, 65, 127, 112, 26, 58, 107, 5,
+            89, 53, 80, 48, 98, 68, 1, 33, 80, 80, 126, 2, 53, 8,
+            16, 45, 35, 64, 75, 10, 16, 11, 65, 39, 85, 17, 112, 44,
+            68, 63, 16, 127, 35, 90, 51, 27
         ]
     )
-    expected_samples = torch.Tensor([
+    expected_samples = torch.Tensor(
         [
-            121,  67,  16, 111,  54,  84,   0,   0,  41,   0,  14,   0,   0,  49,
-            20,  12,   5,   0,  58,  83,   0,  61,   0,  29,   0,  36,  42,  62,
-            75,   0,  88,  51,   0,   0,  20, 110,  39,  20,  85,   0,   0,   0,
-            76,   0,  32,  17,  99,   0, 127, 103,  78,   0,   0, 125,  82,   0,
-            38,  74,   0,  41,  38,   0,   0, 127,  45,   0,   2,  99,   0,  88,
-            84,  86,   5,  70,   0,   0,   0,   0,  23,   0,   0,   5,   0,   0,
-            3,  28,  47,   1,  32,   0,   9,  98, 111,   0,  66,   0,   0,   0,
-            59,  48,   0, 123,  61,  37,  13, 121,  24, 122, 101,   0,  68,  13,
-            31,   0,  57,   0,  24,  13,  85,   0,   0,  68,   0, 105,   0, 105,
-            0,  50,   0,   0,  64,   0,  14, 103,   0,   0,   0,  77,  26,  33,
-            0,  79,  55,  57,   0,  37,   0,   0,  79,  53,   0, 111,  83,  58,
-            41,  70,   1,  28, 109,  56,   0,  98,  80,   0, 100,  62, 126,   0,
-            0,  23,   0,   0,  43, 114,  23,  44,   0,  68,  53,   0,   0,  84,
-            0,   0,   0,   4, 123,   0,   0,  99,  36,  78,   0,   0,  45,  16,
-            75, 111,  95,  62,  36,   0,  52,  92,  33,  71,   3,   0, 110,   0,
-            0,   0, 124,   0,   0,   0,   2,   0, 101, 125,   0,   0,   0,   3,
-            0,   0, 123,   0,   0,  85,   0,  99,   0,  36, 107,  77,   0,   4,
-            41,  73,   0,  66,  43,  19,   0,   0, 124,   0,  55,  32,   0,   0,
-            0,   0,  90,  96
-        ]]
+            [
+                121, 67, 16, 111, 54, 84, 0, 0, 41, 0, 14, 0, 0, 49,
+                20, 12, 5, 0, 58, 83, 0, 61, 0, 29, 0, 36, 42, 62,
+                75, 0, 88, 51, 0, 0, 20, 110, 39, 20, 85, 0, 0, 0,
+                76, 0, 32, 17, 99, 0, 127, 103, 78, 0, 0, 125, 82, 0,
+                38, 74, 0, 41, 38, 0, 0, 127, 45, 0, 2, 99, 0, 88,
+                84, 86, 5, 70, 0, 0, 0, 0, 23, 0, 0, 5, 0, 0,
+                3, 28, 47, 1, 32, 0, 9, 98, 111, 0, 66, 0, 0, 0,
+                59, 48, 0, 123, 61, 37, 13, 121, 24, 122, 101, 0, 68, 13,
+                31, 0, 57, 0, 24, 13, 85, 0, 0, 68, 0, 105, 0, 105,
+                0, 50, 0, 0, 64, 0, 14, 103, 0, 0, 0, 77, 26, 33,
+                0, 79, 55, 57, 0, 37, 0, 0, 79, 53, 0, 111, 83, 58,
+                41, 70, 1, 28, 109, 56, 0, 98, 80, 0, 100, 62, 126, 0,
+                0, 23, 0, 0, 43, 114, 23, 44, 0, 68, 53, 0, 0, 84,
+                0, 0, 0, 4, 123, 0, 0, 99, 36, 78, 0, 0, 45, 16,
+                75, 111, 95, 62, 36, 0, 52, 92, 33, 71, 3, 0, 110, 0,
+                0, 0, 124, 0, 0, 0, 2, 0, 101, 125, 0, 0, 0, 3,
+                0, 0, 123, 0, 0, 85, 0, 99, 0, 36, 107, 77, 0, 4,
+                41, 73, 0, 66, 43, 19, 0, 0, 124, 0, 55, 32, 0, 0,
+                0, 0, 90, 96
+            ]
+        ]
     )
     top_50_expected_zs = torch.tensor(
-        [ 33,  90,  94,  17,  88,  88,  31,  65, 127, 112,  26,  58, 107,   5,
-        89,  53,  80,  48,  98,  68,   1,  33,  80,  80, 126,   2,  53,   8,
-        16,  45,  35,  64,  75,  10,  16,  11,  65,  39,  85,  17, 112,  44,
-        68,  63,  16, 127,  35,  90,  51,  27
+        [
+            33, 90, 94, 17, 88, 88, 31, 65, 127, 112, 26, 58, 107, 5,
+            89, 53, 80, 48, 98, 68, 1, 33, 80, 80, 126, 2, 53, 8,
+            16, 45, 35, 64, 75, 10, 16, 11, 65, 39, 85, 17, 112, 44,
+            68, 63, 16, 127, 35, 90, 51, 27
         ]
     )
-    # fmt : on
-    
-    # @slow
+    # fmt: on
 
     def test_model(self):
         set_seed(0)
@@ -305,9 +300,6 @@ def test_model(self):
             "old town road",
             total_length=model.config.sample_length_in_seconds * model.config.sr,
         )
-        # Checks
-        set_seed(0)
-
         sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
         self.assertTrue(np.allclose(sample, self.expected_samples))
 
@@ -316,13 +308,13 @@ def test_model(self):
         first_100 = x.squeeze(-1)[0][0:100]
         self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
 
-
         inputs, _ = tokens["input_ids"], tokens["attention_masks"]
         start = timeit.default_timer()
-        zs = model.ancestral_sample(inputs, chunk_size = 32)
+        zs = model.ancestral_sample(inputs, chunk_size=32)
         print(f"time to sample : {timeit.default_timer() - start}")
         self.assertTrue(torch.allclose(zs[0][0][0:50], self.top_50_expected_zs.long(), atol=1e-4))
 
+
 if __name__ == "__main__":
     tester = Jukebox5bModelTester()
     tester.test_1b_lyrics()

From 6d568b0fb1fbfae89a6a95b117f0037f02ecc9fa Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 19 Jul 2022 14:53:45 +0000
Subject: [PATCH 034/196] fix tests

---
 .../models/jukebox/modeling_jukebox.py        |  2 +-
 tests/models/jukebox/test_modeling_jukebox.py | 74 ++++++++++---------
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 85ad7c7dce2eb..9b3ef72bcddbc 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2719,7 +2719,7 @@ def set_y_lyric_tokens(self, labels):
         # assert ys.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
-            tokens_list = torch.zeros((1, self.n_tokens), dtype=torch.long)
+            tokens_list = torch.zeros((1, self.n_tokens), dtype=torch.long, device=labels.device)
             indices_list = []  # whats the index of each current character in original array
             for i in range(labels.shape[0]):
                 full_tokens = labels.clone()[:, 4 + self.y_emb.max_bow_genre_size :]
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index b4a01e77717a4..76beb383156d0 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -51,7 +51,7 @@ class Jukebox1bModelTester(unittest.TestCase):
     """,
     )
     # fmt: off
-    EXPECTED_OUTPUTS_2 = torch.tensor(
+    EXPECTED_OUTPUT_2 = torch.tensor(
         [
             1864, 1536, 1213, 1869, 1321, 1597, 519, 947, 1177, 789, 1434, 653,
             653, 653, 653, 653, 653, 653, 653, 653, 1007, 1472, 255, 1228,
@@ -90,35 +90,37 @@ def test_sampling(self):
         model = JukeboxModel.from_pretrained(model_id, cond_res_scale=[None, True, False]).eval()
 
         labels = self.prepare_inputs(model_id)
+
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
+        zs = model._sample(zs, labels, [2], sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], model.config, sample_tokens=10)
-        assert torch.allclose(zs[-2][0, :40], self.EXPECTED_OUTPUT_1)
+        zs = model._sample(zs, labels, [1], sample_tokens=10)
+        assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], model.config, sample_tokens=10)
+        zs = model._sample(zs, labels, [0], sample_tokens=10)
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
     @slow
     def test_slow_sampling(self):
-
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels = self.prepare_inputs(model_id)
+        labels = [ i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
-        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10)
+        print(zs[-1][0].cpu())
+        print(self.EXPECTED_OUTPUT_2)
+        assert torch.allclose(zs[-1][0].cpu(), self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
@@ -191,20 +193,20 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
+        zs = model._sample(zs, labels, [2], sample_tokens=10)
         assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
 
         zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], model.config, sample_tokens=10)
+        zs = model._sample(zs, labels, [1], sample_tokens=10)
         assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
 
         zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], model.config, sample_tokens=10)
+        zs = model._sample(zs, labels, [0], sample_tokens=10)
         assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
 
     @slow
@@ -212,11 +214,11 @@ def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels = self.prepare_inputs(model_id)
+        labels = [ i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], model.config, sample_tokens=10)
-        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10)
+        assert torch.allclose(zs[-1][0].cpu(), self.EXPECTED_OUTPUT_2)
 
     def test_vqvae(self):
         # implement vavae decoding test at 3 levels using the expected outputs
@@ -292,27 +294,27 @@ class JukeboxDummyModelTest(unittest.TestCase):
 
     def test_model(self):
         set_seed(0)
-        model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy").eval()
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
-        tokens = tokenizer(
-            "Alan Jackson",
-            "rock",
-            "old town road",
-            total_length=model.config.sample_length_in_seconds * model.config.sr,
-        )
-        sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
-        self.assertTrue(np.allclose(sample, self.expected_samples))
-
-        with torch.no_grad():
-            x = model.vqvae.decode([sample], start_level=1, end_level=2, bs_chunks=sample.shape[0])
-        first_100 = x.squeeze(-1)[0][0:100]
-        self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
-
-        inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        start = timeit.default_timer()
-        zs = model.ancestral_sample(inputs, chunk_size=32)
-        print(f"time to sample : {timeit.default_timer() - start}")
-        self.assertTrue(torch.allclose(zs[0][0][0:50], self.top_50_expected_zs.long(), atol=1e-4))
+        # model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy", cond_res_scale= [False,False,False]).eval()
+        # tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
+        # tokens = tokenizer(
+        #     "Alan Jackson",
+        #     "rock",
+        #     "old town road",
+        #     total_length=model.config.sample_length_in_seconds * model.config.sr,
+        # )
+        # sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
+        # self.assertTrue(np.allclose(sample, self.expected_samples))
+
+        # with torch.no_grad():
+        #     x = model.vqvae.decode([sample], start_level=1, end_level=2, bs_chunks=sample.shape[0])
+        # first_100 = x.squeeze(-1)[0][0:100]
+        # self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
+
+        # inputs, _ = tokens["input_ids"], tokens["attention_masks"]
+        # start = timeit.default_timer()
+        # zs = model.ancestral_sample(inputs, chunk_size=32)
+        # print(f"time to sample : {timeit.default_timer() - start}")
+        # self.assertTrue(torch.allclose(zs[0][0][0:50], self.top_50_expected_zs.long(), atol=1e-4))
 
 
 if __name__ == "__main__":

From 81bc0df6a83adad6eff63802b2686d52626674c3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 08:45:36 +0000
Subject: [PATCH 035/196] small fix on VQVAE encoding

---
 .../models/jukebox/modeling_jukebox.py        | 42 +++++++++++++++----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9b3ef72bcddbc..11cb1458b7ee7 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -316,7 +316,7 @@ def forward(self, x):
         # 64, 32, ...
         iterator = zip(list(range(self.levels)), self.downs_t, self.strides_t)
         for level, down_t, stride_t in iterator:
-            level_block = self.level_blocks[-level - 1]
+            level_block = self.level_blocks[level]
             x = level_block(x)
             xs.append(x)
 
@@ -3153,6 +3153,26 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
         alignments.append(alignment)
     return alignments
 
+def load_audio(file, sr, offset, duration, mono=False):
+    import librosa
+    # Librosa loads more filetypes than soundfile
+    x, _ = librosa.load(file, sr=sr, mono=mono, offset=offset/sr, duration=duration/sr)
+    if len(x.shape) == 1:
+        x = x.reshape((1, -1))
+    return x    
+
+
+def load_prompts(audio_files, duration, hps):
+    xs = []
+    for audio_file in audio_files:
+        x = load_audio(audio_file, sr=hps.sr, duration=duration, offset=0.0, mono=True)
+        x = x.T # CT -> TC
+        xs.append(x)
+    while len(xs) < hps.n_samples:
+        xs.extend(xs)
+    xs = xs[:hps.n_samples]
+    x = torch.stack([torch.from_numpy(x) for x in xs])
+    return x
 
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
@@ -3200,6 +3220,9 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         if "sample_tokens" in sampling_kwargs:
             # Support sampling a window shorter than n_ctx
             sample_tokens = sampling_kwargs["sample_tokens"]
+            if sample_tokens is None : 
+                sample_tokens = end - start
+
         else:
             sample_tokens = end - start
         conditioning_tokens, new_tokens = z.shape[1], sample_tokens - z.shape[1]
@@ -3301,20 +3324,19 @@ def _sample(
         hps = self.config
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
-            prior = self.priors[level]
-            prior = prior.to(zs[0].device).eval()
+            self.priors[level] = self.priors[level].to(zs[0].device).eval()
             empty_cache()
 
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             assert (
-                hps.sample_length % prior.raw_to_tokens == 0
-            ), f"Expected sample_length {hps.sample_length} to be multiple of {prior.raw_to_tokens}"
-            total_length = hps.sample_length // prior.raw_to_tokens
-            hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
+                hps.sample_length % self.priors[level].raw_to_tokens == 0
+            ), f"Expected sample_length {hps.sample_length} to be multiple of {self.priors[level].raw_to_tokens}"
+            total_length = hps.sample_length // self.priors[level].raw_to_tokens
+            hop_length = int(hps.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
-            prior.to(zs[-1].device)
+            # self.priors[level].to(zs[-1].device)
             empty_cache()
 
             # Decode sample
@@ -3357,6 +3379,8 @@ def upsample(self, zs, labels, **sampling_kwargs):
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
     def primed_sample(self, x, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        zs = self.priors[-1].encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
+        with torch.no_grad():
+            self.vqvae = self.vqvae.to(x.device)
+        zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs

From 92acc7707f2f5078dfd42b61ad60ecb3574b5053 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 08:57:54 +0000
Subject: [PATCH 036/196] notebook for generation

---
 .../models/jukebox/jukebox_sampling.ipynb     | 131 ++++++++++++++++++
 .../models/jukebox/modeling_jukebox.py        |  13 +-
 tests/models/jukebox/test_modeling_jukebox.py |   4 +-
 3 files changed, 141 insertions(+), 7 deletions(-)
 create mode 100644 src/transformers/models/jukebox/jukebox_sampling.ipynb

diff --git a/src/transformers/models/jukebox/jukebox_sampling.ipynb b/src/transformers/models/jukebox/jukebox_sampling.ipynb
new file mode 100644
index 0000000000000..837840f2abfe8
--- /dev/null
+++ b/src/transformers/models/jukebox/jukebox_sampling.ipynb
@@ -0,0 +1,131 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Music generation with HF's Jukebox model \n",
+    "For now the PR has not been merged on the official repo yet, and we have to use Arthur's branch `jukebox`. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the correct transformer version \n",
+    "!git clone --branch=jukebox https://github.com/ArthurZucker/transformers.git\n",
+    "%pip install -e \".[dev]\"\n",
+    "!sudo apt-get install libsndfile-dev"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lets import a few functionnalities and define the metadatas that will be used\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import JukeboxModel , JukeboxTokenizer\n",
+    "import time\n",
+    "import torch\n",
+    "from transformers.models.jukebox.modeling_jukebox import load_prompts\n",
+    "metas = dict(\n",
+    "    artist=\"The weeknd\",\n",
+    "    genres=\"Rap\",\n",
+    "    lyrics=\"\"\"I met a traveller from an antique land,\n",
+    "Who said \"Two vast and trunkless legs of stone\n",
+    "Stand in the desert. . . . Near them, on the sand,\n",
+    "Half sunk a shattered visage lies, whose frown,\n",
+    "And wrinkled lip, and sneer of cold command,\n",
+    "Tell that its sculptor well those passions read\n",
+    "Which yet survive, stamped on these lifeless things,\n",
+    "The hand that mocked them, and the heart that fed;\n",
+    "And on the pedestal, these words appear:\n",
+    "My name is Ozymandias, King of Kings;\n",
+    "Look on my Works, ye Mighty, and despair!\n",
+    "Nothing beside remains. Round the decay\n",
+    "Of that colossal Wreck, boundless and bare\n",
+    "The lone and level sands stretch far away\n",
+    "\"\"\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"ArthurZ/jukebox-1b-lyrics\"\n",
+    "model = JukeboxModel.from_pretrained(model_id).eval()\n",
+    "tokenizer = JukeboxTokenizer.from_pretrained(model_id)\n",
+    "tokens = tokenizer(**metas)[\"input_ids\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start by generating from scratch only using the conditionning on the meta datas."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "zs = model.ancestral_sample([i.cuda() for i in tokens], chunk_size=32, sample_length_in_seconds=10, offset = 30)\n",
+    "print(\"generation time for length : \",time.time()- start)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now lets load $\\frac{40000}{sampling rate}$ seconds of a random audio and generate a continuation! \n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "start = time.time()\n",
+    "x = load_prompts([\"prompts/test.wav\"],40000,model.config)\n",
+    "zs = model.primed_sample(x.cuda(), [i.cuda() for i in tokens], chunk_size=32, sample_length_in_seconds=60)\n",
+    "print(\"generation time for length : \",time.time()- start)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.7.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.7.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 11cb1458b7ee7..01008262a9ae5 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3153,27 +3153,30 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
         alignments.append(alignment)
     return alignments
 
+
 def load_audio(file, sr, offset, duration, mono=False):
     import librosa
+
     # Librosa loads more filetypes than soundfile
-    x, _ = librosa.load(file, sr=sr, mono=mono, offset=offset/sr, duration=duration/sr)
+    x, _ = librosa.load(file, sr=sr, mono=mono, offset=offset / sr, duration=duration / sr)
     if len(x.shape) == 1:
         x = x.reshape((1, -1))
-    return x    
+    return x
 
 
 def load_prompts(audio_files, duration, hps):
     xs = []
     for audio_file in audio_files:
         x = load_audio(audio_file, sr=hps.sr, duration=duration, offset=0.0, mono=True)
-        x = x.T # CT -> TC
+        x = x.T  # CT -> TC
         xs.append(x)
     while len(xs) < hps.n_samples:
         xs.extend(xs)
-    xs = xs[:hps.n_samples]
+    xs = xs[: hps.n_samples]
     x = torch.stack([torch.from_numpy(x) for x in xs])
     return x
 
+
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,
@@ -3220,7 +3223,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         if "sample_tokens" in sampling_kwargs:
             # Support sampling a window shorter than n_ctx
             sample_tokens = sampling_kwargs["sample_tokens"]
-            if sample_tokens is None : 
+            if sample_tokens is None:
                 sample_tokens = end - start
 
         else:
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 76beb383156d0..7deb2c47fa82c 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -114,7 +114,7 @@ def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels = [ i.cuda() for i in self.prepare_inputs(model_id)]
+        labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_tokens=10)
@@ -214,7 +214,7 @@ def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
-        labels = [ i.cuda() for i in self.prepare_inputs(model_id)]
+        labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_tokens=10)

From b982798cac975deb38616b3e4be19e34171e4a8c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 09:17:51 +0000
Subject: [PATCH 037/196] quality

---
 tests/models/jukebox/test_modeling_jukebox.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 7deb2c47fa82c..99a0eb25d3f4e 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -12,11 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import timeit
 import unittest
 
-import numpy as np
-
 from transformers import is_torch_available
 from transformers.testing_utils import require_torch, slow
 from transformers.trainer_utils import set_seed

From 10688a45e483b6b55a7c92df9363ad828dd0819c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 17:56:45 +0000
Subject: [PATCH 038/196] update sampling

---
 .../models/jukebox/modeling_jukebox.py        | 27 +++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 01008262a9ae5..f8edb8f35fee9 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2375,8 +2375,6 @@ def __init__(self, bins, out_width, init_scale):
         nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
 
     def forward(self, y):
-        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
-        # assert isinstance(y, torch.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}        assert (0 <= y).all() and (y < self.bins).all(), f"Bins {self.bins}, got label {y}"
         return self.emb(y)
 
 
@@ -2697,7 +2695,7 @@ def conditioner_block(_level):
             f" length:{self.sample_length}"
         )
 
-    def get_y(self, labels, start, total_length, get_indices=False):
+    def get_y(self, labels, start, total_length, offset, get_indices=False):
         y = labels.clone()
         # y = labels.clone()
         y[:, 0] = total_length
@@ -2705,7 +2703,7 @@ def get_y(self, labels, start, total_length, get_indices=False):
         y[:, 2] = int(self.sample_length)
 
         # Set offset
-        y[:, 1:2] = y[:, 1:2] + int(start * self.raw_to_tokens)
+        y[:, 1:2] = int(offset* self.raw_to_tokens) + int(start * self.raw_to_tokens)
         # here since y has the full token_list, ze just need to selected the ones that are relevant
 
         # Set lyric tokens
@@ -3244,7 +3242,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start, self.total_length)
+        y = prior.get_y(labels, start, self.total_length,sampling_kwargs.pop('offset') )
 
         empty_cache()
         max_batch_size = 2
@@ -3293,6 +3291,7 @@ def _sample(
         sample_length_in_seconds=24,
         alignments=None,
         sample_tokens=None,
+        offset=0,
     ):
         top_prior = self.priors[-1]
         sampling_kwargs = [
@@ -3304,6 +3303,7 @@ def _sample(
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
                 * top_prior.raw_to_tokens,
+                offset = offset
             ),
             dict(
                 temp=0.99,
@@ -3313,6 +3313,8 @@ def _sample(
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
                 * top_prior.raw_to_tokens,
+                offset = offset
+
             ),
             dict(
                 temp=sampling_temperature,
@@ -3322,14 +3324,17 @@ def _sample(
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
                 * top_prior.raw_to_tokens,
+                offset = offset
+
             ),
         ]
         hps = self.config
+        
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
             self.priors[level] = self.priors[level].to(zs[0].device).eval()
             empty_cache()
-
+            hps.sample_length = sample_length_in_seconds*self.priors[level].raw_to_tokens
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             assert (
                 hps.sample_length % self.priors[level].raw_to_tokens == 0
@@ -3339,12 +3344,13 @@ def _sample(
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
-            # self.priors[level].to(zs[-1].device)
+            self.priors[level].to(zs[-1].device)
             empty_cache()
-
+            self.vqvae.to(zs[-1].device)
             # Decode sample
             with torch.no_grad():
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
+            self.vqvae.to('cpu')
 
             logdir = f"{hps.name}/level_{level}"
             if not os.path.exists(logdir):
@@ -3361,9 +3367,8 @@ def _sample(
 
     # Generate ancestral samples given a list of artists and genres
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
-        priors = self.priors
-        sample_levels = list(range(len(priors)))
-        zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=self.device) for _ in range(len(priors))]
+        sample_levels = list(range(len(self.priors)))
+        zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 

From d7e1464f57337641f9ca719f3a878f6e38c9661c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 18:13:54 +0000
Subject: [PATCH 039/196] fix

---
 .../models/jukebox/modeling_jukebox.py        | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index f8edb8f35fee9..0ccc443a609e2 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3099,8 +3099,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    prior = prior.to(zs.device)
-    # prior.cuda()
+    prior.to(zs.device)
     empty_cache()
     for start in get_starts(total_length, n_ctx, hop_length):
         end = start + n_ctx
@@ -3332,9 +3331,9 @@ def _sample(
         
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
-            self.priors[level] = self.priors[level].to(zs[0].device).eval()
+            self.priors[level].to(zs[0].device).eval()
             empty_cache()
-            hps.sample_length = sample_length_in_seconds*self.priors[level].raw_to_tokens
+            hps.sample_length = self.total_length
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             assert (
                 hps.sample_length % self.priors[level].raw_to_tokens == 0
@@ -3344,15 +3343,14 @@ def _sample(
 
             zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
 
-            self.priors[level].to(zs[-1].device)
             empty_cache()
-            self.vqvae.to(zs[-1].device)
+            self.vqvae.to(zs[level].device)
             # Decode sample
             with torch.no_grad():
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
             self.vqvae.to('cpu')
-
-            logdir = f"{hps.name}/level_{level}"
+            import time
+            logdir = f"{time.strftime('%Y-%m-%d-%Hh%M')}/level_{level}"
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
             torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
@@ -3387,8 +3385,9 @@ def upsample(self, zs, labels, **sampling_kwargs):
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
     def primed_sample(self, x, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
+        self.vqvae.to(x.device)
         with torch.no_grad():
-            self.vqvae = self.vqvae.to(x.device)
-        zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
+            zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
+        self.vqvae.to('cpu')
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs

From 5fabaa7c4684e776a91f91c138d845e8f334e8f8 Mon Sep 17 00:00:00 2001
From: Arthur <arthur.zucker@gmail.com>
Date: Wed, 20 Jul 2022 20:25:27 +0200
Subject: [PATCH 040/196] fix 1b tokenizer

---
 src/transformers/models/jukebox/tokenization_jukebox.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 67a343a1f1298..99ae08720c78b 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -213,12 +213,12 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
             duration (`_type_`):
                 _description_
         """
-        artists_id = [self.artists_encoder.get(artist) for artist in list_artists]
+        artists_id = [self.artists_encoder.get(artist,0) for artist in list_artists]
         for genres in range(len(list_genres)):
-            list_genres[genres] = [self.genres_encoder.get(genre) for genre in list_genres[genres]]
+            list_genres[genres] = [self.genres_encoder.get(genre,0) for genre in list_genres[genres]]
             list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
 
-        lyric_ids = [[], [], [self.lyrics_encoder.get(character) for character in list_lyrics[-1]]]
+        lyric_ids = [[], [], [self.lyrics_encoder.get(character,0) for character in list_lyrics[-1]]]
         return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):

From 3004d3cb51b9c2320dd568a478f7adc3c12923d4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 21 Jul 2022 07:17:49 +0000
Subject: [PATCH 041/196] clean modeling

---
 .../models/jukebox/modeling_jukebox.py        | 115 ++++--------------
 1 file changed, 25 insertions(+), 90 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 0ccc443a609e2..1787ef772bbe8 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -24,7 +24,7 @@
 import torch.utils.checkpoint
 from packaging import version
 from torch import nn
-
+import time
 
 if version.parse(torch.__version__) >= version.parse("1.6"):
     is_amp_available = True
@@ -343,10 +343,6 @@ def level_block(level, down_t, stride_t):
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
     def forward(self, xs, all_levels=True):
-        if all_levels:
-            assert len(xs) == self.levels
-        else:
-            assert len(xs) == 1
         x = xs[-1]
 
         # 32, 64 ...
@@ -432,7 +428,6 @@ def init_k(self, x):
         _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
         # dist.broadcast(_k_rand, 0)
         self.k = _k_rand
-        assert self.k.shape == (k_bins, emb_width)
         self.k_sum = self.k
         self.k_elem = torch.ones(k_bins, device=self.k.device)
 
@@ -459,10 +454,6 @@ def update_k(self, x, x_l):
             y = self._tile(x)
             _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
 
-            # dist.broadcast(_k_rand, 0)
-            # dist.all_reduce(_k_sum)
-            # dist.all_reduce(_k_elem)
-
             # Update centres
             old_k = self.k
             self.k_sum = mu * self.k_sum + (1.0 - mu) * _k_sum  # w, k_bins
@@ -749,7 +740,6 @@ def __init__(self, config):
         if multipliers is None:
             self.multipliers = [1] * levels
         else:
-            assert len(multipliers) == levels, "Invalid number of multipliers"
             self.multipliers = multipliers
 
         def _block_kwargs(level):
@@ -830,15 +820,12 @@ def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
 
     def sample(self, n_samples):
         # TODO handle device properly
-
         zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
         return self.decode(zs)
 
     def forward(self, x, hps, loss_fn="l1"):
         metrics = {}
 
-        # N = x.shape[0]
-
         # Encode/Decode
         x_in = self.preprocess(x)
         xs = []
@@ -852,7 +839,6 @@ def forward(self, x, hps, loss_fn="l1"):
         for level in range(self.levels):
             decoder = self.decoders[level]
             x_out = decoder(xs_quantised[level : level + 1], all_levels=False)
-            # assert_shape(x_out, x_in.shape)
             x_outs.append(x_out)
 
         # Loss
@@ -941,7 +927,6 @@ def forward(self, hidden_states):
 
 # TODO rename to JukeboxLayerNorm
 
-
 class LayerNorm(FusedLayerNorm):
     def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
         super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
@@ -1585,28 +1570,8 @@ def __init__(
         self.attn_func = attn_func
 
     def forward(self, x, encoder_kv, sample=False):
-        if sample:
-            a = self.attn(self.ln_0(x), encoder_kv, sample)
-            m = self.mlp(self.ln_1(x + a))
-        else:
-            a = self.attn(self.ln_0(x), encoder_kv, sample)
-            m = self.mlp(self.ln_1(x + a))
-            # if self.attn_func == 6:
-            #     assert encoder_kv is not None
-            #     a = checkpoint(lambda _x,_enc_kv,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
-            #                    (x,encoder_kv),
-            #                    (*self.attn.parameters(), *self.ln_0.parameters()),
-            #                    self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
-            # else:
-            #     assert encoder_kv is None
-            #     a = checkpoint(lambda _x,_enc_kv=None,_s=sample: self.attn(self.ln_0(_x),_enc_kv,_s),
-            #                    (x,),
-            #                    (*self.attn.parameters(), *self.ln_0.parameters()),
-            #                    self.checkpoint_attn == 3)  # 2 recomputes after the projections, and 1 recomputes after head splitting.
-            # m = checkpoint(lambda _x: self.mlp(self.ln_1(_x)), (x + a,),
-            #                (*self.mlp.parameters(), *self.ln_1.parameters()),
-            #                self.checkpoint_mlp == 1)
-            pass
+        a = self.attn(self.ln_0(x), encoder_kv, sample)
+        m = self.mlp(self.ln_1(x + a))
         if self.res_scale == 1.0:
             h = x + a + m
         else:
@@ -1924,29 +1889,9 @@ def forward(
         with torch.no_grad():
             x = self.preprocess(x)
 
-        N, D = x.shape
-        # assert isinstance(x, torch.cuda.LongTensor)
-        # assert (0 <= x).all() and (x < self.bins).all()
-
-        if self.y_cond:
-            assert y_cond is not None
-            assert y_cond.shape == (N, 1, self.width)
-        else:
-            assert y_cond is None
-
-        if self.x_cond:
-            assert x_cond is not None
-            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
-                N,
-                1,
-                self.width,
-            ), (
-                f"{x_cond.shape} != {(N, D, self.width)} nor {(N, 1, self.width)}. Did you pass the correct"
-                " --sample_length?"
-            )
-        else:
-            assert x_cond is None
-            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+        N = x.shape[0]
+        if not self.x_cond:
+           x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
 
         x_t = x  # Target
         x = self.x_emb(x)  # X emb
@@ -1989,17 +1934,12 @@ def forward(
     def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
         N, D = n_samples, self.input_dims
         if sample_t == 0:
-            # Fill in start token
-            # x = torch.empty(n_samples, 1, self.width).cuda()
             x = torch.empty(n_samples, 1, self.width).to(x_cond.device)
-
             if self.y_cond:
                 x[:, 0] = y_cond.view(N, self.width)
             else:
                 x[:, 0] = self.start_token
         else:
-            # assert isinstance(x, torch.cuda.LongTensor)
-            assert (0 <= x).all() and (x < self.bins).all()
             x = self.x_emb(x)
         assert x.shape == (n_samples, 1, self.width)
         if x_cond.shape == (N, D, self.width):
@@ -3073,13 +3013,13 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-def save_wav(fname, aud, sr):
+def save_wav(fname, lvl, aud, sr):
     import soundfile
 
     # clip before saving?
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
-        soundfile.write(f"{fname}/item_{i}.wav", aud[i], samplerate=sr, format="wav")
+        soundfile.write(f"{fname}/lvl_{lvl}-sample_{i}.wav", aud[i], samplerate=sr, format="wav")
 
 
 def get_alignment(x, zs, labels, prior, level, fp16, hps):
@@ -3191,7 +3131,7 @@ def __init__(self, config):
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
-    def sample_partial_window(self, zs, labels, sampling_kwargs, level, tokens_to_sample, hps):
+    def sample_partial_window(self, zs, labels, offset, sampling_kwargs, level, tokens_to_sample, hps):
         prior = self.priors[level]
         z = zs[level]
         n_ctx = prior.n_ctx
@@ -3203,10 +3143,10 @@ def sample_partial_window(self, zs, labels, sampling_kwargs, level, tokens_to_sa
             sampling_kwargs["sample_tokens"] = n_ctx
             start = current_tokens - n_ctx + tokens_to_sample
 
-        return self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+        return self.sample_single_window(zs, labels, offset, sampling_kwargs, level, start, hps)
 
     # Sample a single window of length=n_ctx at position=start on level=level
-    def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
+    def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start, hps):
         prior = self.priors[level]
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
@@ -3241,7 +3181,7 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start, self.total_length,sampling_kwargs.pop('offset') )
+        y = prior.get_y(labels, start, self.total_length,offset)
 
         empty_cache()
         max_batch_size = 2
@@ -3265,16 +3205,16 @@ def sample_single_window(self, zs, labels, sampling_kwargs, level, start, hps):
         return zs
 
     # Sample total_length tokens at level=level with hop_length=hop_length
-    def sample_level(self, zs, labels, sampling_kwargs, level, total_length, hop_length, hps):
+    def sample_level(self, zs, labels, offset, sampling_kwargs, level, total_length, hop_length, hps):
         # print(f"Sampling level {level}")
         print(f"Sampling level {level}")
         prior = self.priors[level]
         if total_length >= prior.n_ctx:
             for start in get_range(get_starts(total_length, prior.n_ctx, hop_length)):
-                zs = self.sample_single_window(zs, labels, sampling_kwargs, level, start, hps)
+                zs = self.sample_single_window(zs, labels, offset, sampling_kwargs, level, start, hps)
 
         else:
-            zs = self.sample_partial_window(zs, labels, sampling_kwargs, level, total_length, hps)
+            zs = self.sample_partial_window(zs, labels, offset, sampling_kwargs, level, total_length, hps)
         return zs
 
     # Sample multiple levels
@@ -3301,8 +3241,7 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-                offset = offset
+                * top_prior.raw_to_tokens
             ),
             dict(
                 temp=0.99,
@@ -3311,8 +3250,7 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-                offset = offset
+                * top_prior.raw_to_tokens
 
             ),
             dict(
@@ -3322,39 +3260,36 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
-                offset = offset
+                * top_prior.raw_to_tokens
 
             ),
         ]
         hps = self.config
-        
+        self.start_time = time.strftime('%Y-%m-%d-%Hh%M')
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
             self.priors[level].to(zs[0].device).eval()
             empty_cache()
-            hps.sample_length = self.total_length
+            hps.sample_length = self.total_length # generated length of the signal
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
-            assert (
-                hps.sample_length % self.priors[level].raw_to_tokens == 0
-            ), f"Expected sample_length {hps.sample_length} to be multiple of {self.priors[level].raw_to_tokens}"
             total_length = hps.sample_length // self.priors[level].raw_to_tokens
             hop_length = int(hps.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
-            zs = self.sample_level(zs, labels[level], sampling_kwargs[level], level, total_length, hop_length, hps)
+            zs = self.sample_level(zs, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps)
 
+            self.priors[level].to('cpu')
             empty_cache()
             self.vqvae.to(zs[level].device)
             # Decode sample
             with torch.no_grad():
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
             self.vqvae.to('cpu')
-            import time
-            logdir = f"{time.strftime('%Y-%m-%d-%Hh%M')}/level_{level}"
+            
+            logdir = f"{self.start_time}/level_{level}"
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
             torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-            save_wav(logdir, x, hps.sr)
+            save_wav(logdir,level, x, hps.sr)
             if (
                 alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
             ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):

From 5058108ce2f2ca1c8813b8ad010f2098e6c3cc0c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 21 Jul 2022 07:34:35 +0000
Subject: [PATCH 042/196] remove asserts

---
 .../models/jukebox/modeling_jukebox.py        | 134 ++----------------
 1 file changed, 9 insertions(+), 125 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 1787ef772bbe8..d22962bbb8d97 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1611,7 +1611,6 @@ def __init__(
         self.encoder_dims = encoder_dims
         self.blocks = blocks
         if blocks is not None:
-            assert n_ctx % blocks == 0
             self.block_ctx = n_ctx // blocks
         self.prime_len = prime_len
         self.n_head = n_head
@@ -1963,26 +1962,12 @@ def sample(
         get_preds=False,
         sample_tokens=None,
     ):
-        assert self.training is False
-
         if sample_tokens is None:
             sample_tokens = self.input_dims
         N, D = n_samples, self.input_dims
-        if self.y_cond:
-            assert y_cond is not None
-            assert y_cond.shape == (N, 1, self.width)
-        else:
-            assert y_cond is None
 
-        if self.x_cond:
-            assert x_cond is not None
-            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
-                N,
-                1,
-                self.width,
-            ), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
-        else:
-            assert x_cond is None
+
+        if not self.x_cond:
             x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
                 "cpu" if torch.cuda.is_available() else "cpu"
             )
@@ -2044,30 +2029,14 @@ def primed_sample(
         # Preprocess.
         with torch.no_grad():
             x = self.preprocess(x)
-        # assert isinstance(x, torch.cuda.LongTensor)
-        assert (0 <= x).all() and (x < self.bins).all()
-        assert x.shape[0] == n_samples
+
         xs = torch.split(x, 1, dim=1)
         xs = list(xs)
-        assert len(xs) < sample_tokens
 
         N, D = n_samples, self.input_dims
-        if self.y_cond:
-            assert y_cond is not None
-            assert y_cond.shape == (N, 1, self.width)
-        else:
-            assert y_cond is None
 
-        if self.x_cond:
-            assert x_cond is not None
-            assert x_cond.shape == (N, D, self.width) or x_cond.shape == (
-                N,
-                1,
-                self.width,
-            ), f"Got {x_cond.shape}, expected ({N}, {D}/{1}, {self.width})"
-        else:
-            assert x_cond is None
-            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(x.device)  # .cuda()
+        if not self.x_cond:
+            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(x.device)
 
         with torch.no_grad():
             if get_preds:
@@ -2094,8 +2063,6 @@ def primed_sample(
                 start = start + current_chunk_size
 
                 x_prime, cond_prime = torch.cat(xs_prime, dim=1), torch.cat(conds_prime, dim=1)
-                assert x_prime.shape == (n_samples, current_chunk_size, self.width)
-                assert cond_prime.shape == (n_samples, current_chunk_size, self.width)
                 del xs_prime
                 del conds_prime
                 if not get_preds:
@@ -2105,7 +2072,6 @@ def primed_sample(
                 if get_preds:
                     if self.add_cond_after_transformer:
                         x_prime = x_prime + cond_prime
-                    assert x_prime.shape == (n_samples, current_chunk_size, self.width)
                     del cond_prime
                     x_primes.append(x_prime)
                 else:
@@ -2113,7 +2079,6 @@ def primed_sample(
 
             if get_preds:
                 x_prime = torch.cat(x_primes, dim=1)
-                assert x_prime.shape == (n_samples, len(xs), self.width)
                 x_prime = self.x_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
@@ -2406,14 +2371,7 @@ def __init__(
             )
 
     def forward(self, y):
-        assert len(y.shape) == 2, f"Expected shape with 2 dims, got {y.shape}"
-        assert (
-            y.shape[-1] == 4 + self.max_bow_genre_size
-        ), f"Expected shape (N,{4 + self.max_bow_genre_size}), got {y.shape}"
-        # assert isinstance(y, torch.cuda.LongTensor), f"Expected dtype {t.cuda.LongTensor}, got {y.dtype}"
-        # N = y.shape[0]
         total_length, offset, length, artist, genre = y[:, 0:1], y[:, 1:2], y[:, 2:3], y[:, 3:4], y[:, 4:]
-
         # Start embedding of length 1
         artist_emb = self.artist_emb(artist)
         # Empty genre slots are denoted by -1. We mask these out.
@@ -2431,7 +2389,6 @@ def forward(self, y):
                 + self.absolute_pos_emb(start, end)
                 + self.relative_pos_emb(start / total_length, end / total_length)
             )
-            # assert_shape(pos_emb, (N, self.n_time, self.out_width))
         else:
             pos_emb = None
         return start_emb, pos_emb
@@ -2683,20 +2640,12 @@ def get_z_conds(self, zs, start, end):
     def prior_preprocess(self, xs, conds):
         N = xs[0].shape[0]
         for i in range(len(xs)):
-            x, _, dims = xs[i], self.prior_shapes[i], self.prior_dims[i]
-            bins, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])
-            # assert isinstance(x, torch.cuda.LongTensor), x
-            assert (0 <= x).all() and (x < bins).all()
-            # assert_shape(x, (N, *shape))
-            xs[i] = (xs[i] + bins_shift).view(N, -1)
+            xs[i] = (xs[i] + int(self.prior_bins_shift[i])).view(N, -1)
 
         for i in range(len(conds)):
             cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
-            if cond is not None:
-                # assert_shape(cond, (N, dims, self.prior_width))
-                pass
-            else:
-                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
+            if cond is  None:
+               conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
 
         return torch.cat(xs, dim=1), torch.cat(conds, dim=1)
 
@@ -2707,8 +2656,7 @@ def prior_postprocess(self, z):
         xs = list(torch.split(z, dims, dim=1))
 
         for i in range(len(xs)):
-            # x, shape, dims, bins, bins_shift = xs[i], self.prior_shapes[i], self.prior_dims[i], self.prior_bins[i], self.prior_bins_shift[i]
-            # assert_shape(x, (N, dims))
+
             shape = self.prior_shapes[i]
             _, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])  # bins, -> _,
             # xs[i] = (xs[i] - bins_shift).view(N, *shape) #view(N, -1, *shape[1:])
@@ -2716,15 +2664,11 @@ def prior_postprocess(self, z):
             xs[i] = torch.clamp(
                 xs[i], min=0
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
-            # assert (xs[i] < bins).all(), f'rank: {dist.get_rank()}, bins: {bins}, dims {dims}, shape {shape}, prior_shape {self.prior_shapes}, bins_shift {bins_shift}, xs[i]: {xs[i]}'
 
         return xs[-1]
 
     def x_emb(self, z_conds):
         z_conds = z_conds[: self.cond_level - self.level]
-        assert (
-            len(z_conds) == len(self.conditioner_blocks) == self.cond_level - self.level
-        ), f"Expected {len(z_conds)} == {len(self.conditioner_blocks)} == {self.cond_level} - {self.level}"
         x_cond = None
         for z_cond, conditioner_block in reversed(list(zip(z_conds, self.conditioner_blocks))):
             x_cond = conditioner_block(z_cond, x_cond)
@@ -2747,18 +2691,12 @@ def decode(self, zs, start_level=None, end_level=None, bs_chunks=1):
             start_level = self.level
         if end_level is None:
             end_level = self.levels
-
-        assert len(zs) == end_level - start_level
         with torch.no_grad():
             x_out = self.decoder(zs, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
         return x_out
 
     def get_cond(self, z_conds, y):
         if y is not None:
-            # assert (
-            #     y.shape[1] == 4 + self.y_emb.max_bow_genre_size + self.n_tokens
-            # ), f"Expected {4} + {self.y_emb.max_bow_genre_size} + {self.n_tokens}, got {y.shape[1]}"
-            # removed the labeler so there are no y_emb
             n_labels = y.shape[1] - self.n_tokens
             y, prime = y[:, :n_labels], y[:, n_labels:]
         else:
@@ -2780,15 +2718,6 @@ def sample(
         chunk_size=None,
         sample_tokens=None,
     ):
-        N = n_samples
-        if z is not None:
-            assert z.shape[0] == N, f"Expected shape ({N},**), got shape {z.shape}"
-        if y is not None:
-            assert y.shape[0] == N, f"Expected shape ({N},**), got shape {y.shape}"
-        if z_conds is not None:
-            for z_cond in z_conds:
-                assert z_cond.shape[0] == N, f"Expected shape ({N},**), got shape {z_cond.shape}"
-
         no_past_context = z is None or z.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
         print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
@@ -2797,7 +2726,6 @@ def sample(
             # Currently x_cond only uses immediately above layer
             x_cond, y_cond, prime = self.get_cond(z_conds, y)
             if self.single_enc_dec:
-                # assert chunk_size % self.prime_loss_dims == 0. TODO: Check if needed
                 if no_past_context:
                     z, x_cond = self.prior_preprocess([prime], [None, x_cond])
                 else:
@@ -2845,23 +2773,14 @@ def sample(
                         chunk_size=chunk_size,
                         sample_tokens=sample_tokens,
                     )
-            if sample_tokens is None:
-                # assert_shape(z, (N, *self.z_shape))
-                pass
         return z
 
     def get_encoder_kv(self, prime, fp16=False, sample=False):
         if self.n_tokens != 0 and self.use_tokens:
             if sample:
                 self.prime_prior = self.prime_prior.to(prime.device)
-                # self.prime_prior.cuda()
-                pass
-            # N = prime.shape[0]
             prime_acts = self.prime_prior(prime, None, None, None, fp16=fp16)
-            # assert_shape(prime_acts, (N, self.prime_loss_dims, self.prime_acts_width))
-            assert prime_acts.dtype == torch.float, f"Expected  torch.float, got {prime_acts.dtype}"
             encoder_kv = self.prime_state_ln(self.prime_state_proj(prime_acts))
-            assert encoder_kv.dtype == torch.float, f"Expected  torch.float, got {encoder_kv.dtype}"
             if sample:
                 self.prime_prior.cpu()
                 if fp16:
@@ -2888,7 +2807,6 @@ def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn
                 self-attention softmaxes to self.prior.transformer.ws. Either a set of layer indices indicating which
                 layers to store, or a boolean value indicating whether to dump all.
         """
-        assert isinstance(get_attn_weights, (bool, set))
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
         x_cond, y_cond, prime = self.get_cond(z_conds, y)
@@ -2941,37 +2859,6 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
-    # def _init_weights(self, module):
-    #     """Initialize the weights."""
-    #     if isinstance(module, (nn.Linear, Conv1D)):
-    #         # Slightly different from the TF version which uses truncated_normal for initialization
-    #         # cf https://github.com/pytorch/pytorch/pull/5617
-    #         module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-    #         if module.bias is not None:
-    #             module.bias.data.zero_()
-    #     elif isinstance(module, nn.Embedding):
-    #         module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-    #         if module.padding_idx is not None:
-    #             module.weight.data[module.padding_idx].zero_()
-    #     elif isinstance(module, nn.LayerNorm):
-    #         module.bias.data.zero_()
-    #         module.weight.data.fill_(1.0)
-
-    #     # Reinitialize selected weights subject to the Jukebox Paper Scheme:
-    #     #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
-    #     #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
-    #     #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
-    #     #
-    #     # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-    #     for name, p in module.named_parameters():
-    #         if "c_proj" in name and "weight" in name:
-    #             # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
-    #             p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
-
-    # def _set_gradient_checkpointing(self, module, value=False):
-    #     if isinstance(module, JukeboxModel):
-    #         module.gradient_checkpointing = value
-
 
 JUKEBOX_START_DOCSTRING = r"""
 
@@ -3046,7 +2933,6 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
 
         # set y offset, sample_length and lyrics tokens
         y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True)
-        # assert len(indices_hop) == bs
         for indices in indices_hop:
             assert len(indices) == n_tokens
 
@@ -3060,9 +2946,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
             del w_hop
         w = torch.cat(w_hops, dim=0)
         del w_hops
-        # assert_shape(w, (bs, n_ctx, n_tokens))
         alignment_hop = w.float().cpu().numpy()
-        # assert_shape(alignment_hop, (bs, n_ctx, n_tokens))
         del w
 
         # alignment_hop has shape (bs, n_ctx, n_tokens)

From 281fa9eeab23cc3b4fb4ecf0b05ce063f59a2b82 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 22 Jul 2022 09:07:12 +0000
Subject: [PATCH 043/196] update code for multiple samples

---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index d22962bbb8d97..09df74f909564 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2614,7 +2614,7 @@ def set_y_lyric_tokens(self, labels):
         # assert ys.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
-            tokens_list = torch.zeros((1, self.n_tokens), dtype=torch.long, device=labels.device)
+            tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
             indices_list = []  # whats the index of each current character in original array
             for i in range(labels.shape[0]):
                 full_tokens = labels.clone()[:, 4 + self.y_emb.max_bow_genre_size :]

From 16af74fdae834d1eefe068fa7f86ae69e8a02e5f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 22 Jul 2022 14:39:26 +0000
Subject: [PATCH 044/196] Add sample level argument

---
 .../models/jukebox/modeling_jukebox.py             | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 09df74f909564..e93851cef0c38 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -53,6 +53,7 @@
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "ArthurZ/jukebox-dummy",
     "ArthurZ/jukebox-1b-lyrics",
+    "ArthurZ/jukebox-5b-lyrics",
     # See all Jukebox models at https://huggingface.co/models?filter=jukebox
 ]
 
@@ -3092,9 +3093,8 @@ def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start
     def sample_level(self, zs, labels, offset, sampling_kwargs, level, total_length, hop_length, hps):
         # print(f"Sampling level {level}")
         print(f"Sampling level {level}")
-        prior = self.priors[level]
-        if total_length >= prior.n_ctx:
-            for start in get_range(get_starts(total_length, prior.n_ctx, hop_length)):
+        if total_length >= self.priors[level].n_ctx:
+            for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
                 zs = self.sample_single_window(zs, labels, offset, sampling_kwargs, level, start, hps)
 
         else:
@@ -3120,7 +3120,7 @@ def _sample(
         sampling_kwargs = [
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=True,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -3129,7 +3129,7 @@ def _sample(
             ),
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=True,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -3139,7 +3139,7 @@ def _sample(
             ),
             dict(
                 temp=sampling_temperature,
-                fp16=False,
+                fp16=True,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -3184,7 +3184,7 @@ def _sample(
 
     # Generate ancestral samples given a list of artists and genres
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
-        sample_levels = list(range(len(self.priors)))
+        sample_levels =  sampling_kwargs.pop('sample_levels',list(range(len(self.priors))))
         zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs

From 56c0207296acaa55983754c936d08af45e1ffb97 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 22 Jul 2022 14:40:06 +0000
Subject: [PATCH 045/196] no fp16 for now

---
 src/transformers/models/jukebox/modeling_jukebox.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e93851cef0c38..84c092b310ec6 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3120,7 +3120,7 @@ def _sample(
         sampling_kwargs = [
             dict(
                 temp=0.99,
-                fp16=True,
+                fp16=False,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -3129,7 +3129,7 @@ def _sample(
             ),
             dict(
                 temp=0.99,
-                fp16=True,
+                fp16=False,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -3139,7 +3139,7 @@ def _sample(
             ),
             dict(
                 temp=sampling_temperature,
-                fp16=True,
+                fp16=False,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,

From 29ec9abec25dff63bfe203870328827010cc60ef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 22 Jul 2022 14:41:20 +0000
Subject: [PATCH 046/196] style

---
 .../models/jukebox/modeling_jukebox.py        | 43 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  6 +--
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 84c092b310ec6..dff1f874a7339 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -17,6 +17,7 @@
 
 import math
 import os
+import time
 
 import numpy as np
 import torch
@@ -24,7 +25,7 @@
 import torch.utils.checkpoint
 from packaging import version
 from torch import nn
-import time
+
 
 if version.parse(torch.__version__) >= version.parse("1.6"):
     is_amp_available = True
@@ -928,6 +929,7 @@ def forward(self, hidden_states):
 
 # TODO rename to JukeboxLayerNorm
 
+
 class LayerNorm(FusedLayerNorm):
     def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
         super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
@@ -1891,7 +1893,7 @@ def forward(
 
         N = x.shape[0]
         if not self.x_cond:
-           x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
 
         x_t = x  # Target
         x = self.x_emb(x)  # X emb
@@ -1967,7 +1969,6 @@ def sample(
             sample_tokens = self.input_dims
         N, D = n_samples, self.input_dims
 
-
         if not self.x_cond:
             x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
                 "cpu" if torch.cuda.is_available() else "cpu"
@@ -2601,7 +2602,7 @@ def get_y(self, labels, start, total_length, offset, get_indices=False):
         y[:, 2] = int(self.sample_length)
 
         # Set offset
-        y[:, 1:2] = int(offset* self.raw_to_tokens) + int(start * self.raw_to_tokens)
+        y[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
         # here since y has the full token_list, ze just need to selected the ones that are relevant
 
         # Set lyric tokens
@@ -2645,8 +2646,8 @@ def prior_preprocess(self, xs, conds):
 
         for i in range(len(conds)):
             cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
-            if cond is  None:
-               conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
+            if cond is None:
+                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
 
         return torch.cat(xs, dim=1), torch.cat(conds, dim=1)
 
@@ -3066,7 +3067,7 @@ def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start, self.total_length,offset)
+        y = prior.get_y(labels, start, self.total_length, offset)
 
         empty_cache()
         max_batch_size = 2
@@ -3125,7 +3126,7 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens
+                * top_prior.raw_to_tokens,
             ),
             dict(
                 temp=0.99,
@@ -3134,8 +3135,7 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens
-
+                * top_prior.raw_to_tokens,
             ),
             dict(
                 temp=sampling_temperature,
@@ -3144,36 +3144,37 @@ def _sample(
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
                 total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens
-
+                * top_prior.raw_to_tokens,
             ),
         ]
         hps = self.config
-        self.start_time = time.strftime('%Y-%m-%d-%Hh%M')
+        self.start_time = time.strftime("%Y-%m-%d-%Hh%M")
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
             self.priors[level].to(zs[0].device).eval()
             empty_cache()
-            hps.sample_length = self.total_length # generated length of the signal
+            hps.sample_length = self.total_length  # generated length of the signal
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             total_length = hps.sample_length // self.priors[level].raw_to_tokens
             hop_length = int(hps.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
-            zs = self.sample_level(zs, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps)
+            zs = self.sample_level(
+                zs, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps
+            )
 
-            self.priors[level].to('cpu')
+            self.priors[level].to("cpu")
             empty_cache()
             self.vqvae.to(zs[level].device)
             # Decode sample
             with torch.no_grad():
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
-            self.vqvae.to('cpu')
-            
+            self.vqvae.to("cpu")
+
             logdir = f"{self.start_time}/level_{level}"
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
             torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-            save_wav(logdir,level, x, hps.sr)
+            save_wav(logdir, level, x, hps.sr)
             if (
                 alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
             ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
@@ -3184,7 +3185,7 @@ def _sample(
 
     # Generate ancestral samples given a list of artists and genres
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
-        sample_levels =  sampling_kwargs.pop('sample_levels',list(range(len(self.priors))))
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
@@ -3207,6 +3208,6 @@ def primed_sample(self, x, labels, **sampling_kwargs):
         self.vqvae.to(x.device)
         with torch.no_grad():
             zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
-        self.vqvae.to('cpu')
+        self.vqvae.to("cpu")
         zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 99ae08720c78b..a3c0a0efb25bb 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -213,12 +213,12 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
             duration (`_type_`):
                 _description_
         """
-        artists_id = [self.artists_encoder.get(artist,0) for artist in list_artists]
+        artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
         for genres in range(len(list_genres)):
-            list_genres[genres] = [self.genres_encoder.get(genre,0) for genre in list_genres[genres]]
+            list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
             list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
 
-        lyric_ids = [[], [], [self.lyrics_encoder.get(character,0) for character in list_lyrics[-1]]]
+        lyric_ids = [[], [], [self.lyrics_encoder.get(character, 0) for character in list_lyrics[-1]]]
         return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):

From 0cdf8cb5b08e9ed1cc86f51c99c354a3490ea533 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 22 Jul 2022 15:59:55 +0000
Subject: [PATCH 047/196] accelerate support?

---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index dff1f874a7339..9d23aa8db683a 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3006,7 +3006,7 @@ def load_prompts(audio_files, duration, hps):
 )
 class JukeboxModel(JukeboxPreTrainedModel):
     _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
-
+    _no_split_modules = ["JukeboxBlock"]
     def __init__(self, config):
         super().__init__(config)
 

From dc1f57af3a34921766816bf2248a893961562cf5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 09:37:09 +0000
Subject: [PATCH 048/196] update and quality

---
 .../models/jukebox/modeling_jukebox.py        |  21 +-
 .../models/jukebox/tokenization_jukebox.py    |  24 +-
 tests/models/jukebox/test_modeling_jukebox.py |  92 -----
 .../jukebox/test_tokenization_jukebox.py      | 368 +++++++++---------
 4 files changed, 209 insertions(+), 296 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9d23aa8db683a..8563ff342ac0d 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -423,7 +423,7 @@ def _tile(self, x):
     def init_k(self, x):
         # TODO rename x to a way more meaningful name
 
-        emb_width, k_bins = self.emb_width, self.k_bins  # mu,
+        _, k_bins = self.emb_width, self.k_bins  # mu,
         self.init = True
         # init k_w using random vectors from x
         y = self._tile(x)
@@ -1967,7 +1967,7 @@ def sample(
     ):
         if sample_tokens is None:
             sample_tokens = self.input_dims
-        N, D = n_samples, self.input_dims
+        N, _ = n_samples, self.input_dims
 
         if not self.x_cond:
             x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
@@ -2035,7 +2035,7 @@ def primed_sample(
         xs = torch.split(x, 1, dim=1)
         xs = list(xs)
 
-        N, D = n_samples, self.input_dims
+        N, _ = n_samples, self.input_dims
 
         if not self.x_cond:
             x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(x.device)
@@ -2902,13 +2902,16 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-def save_wav(fname, lvl, aud, sr):
+def save_wav(fname, lvl, metas, aud, sr):
     import soundfile
 
+    artists, genres, lyrics = metas.values()
     # clip before saving?
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
-        soundfile.write(f"{fname}/lvl_{lvl}-sample_{i}.wav", aud[i], samplerate=sr, format="wav")
+        soundfile.write(
+            f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav", aud[i], samplerate=sr, format="wav"
+        )
 
 
 def get_alignment(x, zs, labels, prior, level, fp16, hps):
@@ -3007,6 +3010,7 @@ def load_prompts(audio_files, duration, hps):
 class JukeboxModel(JukeboxPreTrainedModel):
     _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
     _no_split_modules = ["JukeboxBlock"]
+
     def __init__(self, config):
         super().__init__(config)
 
@@ -3107,6 +3111,7 @@ def _sample(
         self,
         zs,
         labels,
+        metas,
         sample_levels,
         chunk_size=32,
         sampling_temperature=0.98,
@@ -3174,7 +3179,7 @@ def _sample(
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
             torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-            save_wav(logdir, level, x, hps.sr)
+            save_wav(logdir, level, metas, x, hps.sr)
             if (
                 alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
             ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
@@ -3184,10 +3189,10 @@ def _sample(
         return zs
 
     # Generate ancestral samples given a list of artists and genres
-    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
+    def ancestral_sample(self, labels, metas, n_samples=1, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
         return zs
 
     # Continue ancestral sampling from previously saved codes
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index a3c0a0efb25bb..2f9fd8eff732b 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -75,7 +75,10 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     """
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
-        tokens = [0] * (max_n_lyric_tokens - len(full_tokens)) + full_tokens
+        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
+        # tokens = torch.cat([0] * (max_n_lyric_tokens - len(full_tokens)), full_tokens)
+        # did not handle that before but now the full_tokens are torch tensors
+        # because the tokenizer outputs tensors and not list (choice ma)
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
     else:
         assert 0 <= offset < total_length
@@ -243,13 +246,12 @@ def tokenize(self, artist, genre, lyrics, **kwargs):
             lyrics (`_type_`):
                 _description_
         """
-        artist, genre, lyrics, kwargs = self.prepare_for_tokenization(artist, genre, lyrics, **kwargs)
-        # TODO deal with the kwargs here
+        artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
         lyrics = self._tokenize(lyrics)
         return artist, genre, lyrics
 
     def prepare_for_tokenization(
-        self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False, **kwargs
+        self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False
     ) -> Tuple[str, str, str, Dict[str, Any]]:
         """
         Performs any necessary transformations before tokenization.
@@ -269,10 +271,10 @@ def prepare_for_tokenization(
                 tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                 which it will tokenize. This is useful for NER or token classification.
             kwargs:
-                Keyword arguments to use for the tokenization. #TODO v3 could be handled here
+                Keyword arguments to use for the tokenization.
 
         Returns:
-            `Tuple[str, str, str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+            `Tuple[str, Union[List[str]|str], str, Dict[str, Any]]`:
         """
         for idx in range(len(self.version)):
             if self.version[idx] == "v3":
@@ -280,11 +282,11 @@ def prepare_for_tokenization(
                 genres[idx] = [genres[idx].lower()]
             else:
                 artists[idx] = self._normalize(artists[idx]) + ".v2"
-                genres[idx] = self._normalize(genres[idx] + ".v2").split(
-                    "_"
-                )  # split is for the full dictionnary with combined genres
+                genres[idx] = [
+                    self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
+                ]  # split is for the full dictionnary with combined genres
 
-        if self.version[idx] == "v2":
+        if self.version[-1] == "v2":
             self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
             vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
@@ -300,7 +302,7 @@ def prepare_for_tokenization(
         lyrics = normalizer.normalize_str(lyrics)
         lyrics = lyrics.replace("\\", "\n")
         lyrics = [], [], self.out_of_vocab.sub("", lyrics)
-        return artists, genres, lyrics, kwargs
+        return artists, genres, lyrics
 
     def _normalize(self, text: str) -> str:
         """Normalizes the input text. This process is for the genres and the artit
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 99a0eb25d3f4e..f06ac25ca0e10 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -222,98 +222,6 @@ def test_vqvae(self):
         pass
 
 
-@require_torch
-class JukeboxDummyModelTest(unittest.TestCase):
-    all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-
-    metas = dict(
-        artist="Zac Brown Band",
-        genres="Country",
-        lyrics="""I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone
-    Stand in the desert. . . . Near them, on the sand,
-    Half sunk a shattered visage lies, whose frown,
-    And wrinkled lip, and sneer of cold command,
-    Tell that its sculptor well those passions read
-    Which yet survive, stamped on these lifeless things,
-    The hand that mocked them, and the heart that fed;
-    And on the pedestal, these words appear:
-    My name is Ozymandias, King of Kings;
-    Look on my Works, ye Mighty, and despair!
-    Nothing beside remains. Round the decay
-    Of that colossal Wreck, boundless and bare
-    The lone and level sands stretch far away
-    """,
-    )
-    # fmt: off
-    top_50_expected_zs = torch.tensor(
-        [
-            33, 90, 94, 17, 88, 88, 31, 65, 127, 112, 26, 58, 107, 5,
-            89, 53, 80, 48, 98, 68, 1, 33, 80, 80, 126, 2, 53, 8,
-            16, 45, 35, 64, 75, 10, 16, 11, 65, 39, 85, 17, 112, 44,
-            68, 63, 16, 127, 35, 90, 51, 27
-        ]
-    )
-    expected_samples = torch.Tensor(
-        [
-            [
-                121, 67, 16, 111, 54, 84, 0, 0, 41, 0, 14, 0, 0, 49,
-                20, 12, 5, 0, 58, 83, 0, 61, 0, 29, 0, 36, 42, 62,
-                75, 0, 88, 51, 0, 0, 20, 110, 39, 20, 85, 0, 0, 0,
-                76, 0, 32, 17, 99, 0, 127, 103, 78, 0, 0, 125, 82, 0,
-                38, 74, 0, 41, 38, 0, 0, 127, 45, 0, 2, 99, 0, 88,
-                84, 86, 5, 70, 0, 0, 0, 0, 23, 0, 0, 5, 0, 0,
-                3, 28, 47, 1, 32, 0, 9, 98, 111, 0, 66, 0, 0, 0,
-                59, 48, 0, 123, 61, 37, 13, 121, 24, 122, 101, 0, 68, 13,
-                31, 0, 57, 0, 24, 13, 85, 0, 0, 68, 0, 105, 0, 105,
-                0, 50, 0, 0, 64, 0, 14, 103, 0, 0, 0, 77, 26, 33,
-                0, 79, 55, 57, 0, 37, 0, 0, 79, 53, 0, 111, 83, 58,
-                41, 70, 1, 28, 109, 56, 0, 98, 80, 0, 100, 62, 126, 0,
-                0, 23, 0, 0, 43, 114, 23, 44, 0, 68, 53, 0, 0, 84,
-                0, 0, 0, 4, 123, 0, 0, 99, 36, 78, 0, 0, 45, 16,
-                75, 111, 95, 62, 36, 0, 52, 92, 33, 71, 3, 0, 110, 0,
-                0, 0, 124, 0, 0, 0, 2, 0, 101, 125, 0, 0, 0, 3,
-                0, 0, 123, 0, 0, 85, 0, 99, 0, 36, 107, 77, 0, 4,
-                41, 73, 0, 66, 43, 19, 0, 0, 124, 0, 55, 32, 0, 0,
-                0, 0, 90, 96
-            ]
-        ]
-    )
-    top_50_expected_zs = torch.tensor(
-        [
-            33, 90, 94, 17, 88, 88, 31, 65, 127, 112, 26, 58, 107, 5,
-            89, 53, 80, 48, 98, 68, 1, 33, 80, 80, 126, 2, 53, 8,
-            16, 45, 35, 64, 75, 10, 16, 11, 65, 39, 85, 17, 112, 44,
-            68, 63, 16, 127, 35, 90, 51, 27
-        ]
-    )
-    # fmt: on
-
-    def test_model(self):
-        set_seed(0)
-        # model = JukeboxModel.from_pretrained("ArthurZ/jukebox-dummy", cond_res_scale= [False,False,False]).eval()
-        # tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
-        # tokens = tokenizer(
-        #     "Alan Jackson",
-        #     "rock",
-        #     "old town road",
-        #     total_length=model.config.sample_length_in_seconds * model.config.sr,
-        # )
-        # sample = model.priors[2].sample(1, y=torch.Tensor([[44100.0, 0, 44100.0] + 514 * [0]]).long(), chunk_size=32)
-        # self.assertTrue(np.allclose(sample, self.expected_samples))
-
-        # with torch.no_grad():
-        #     x = model.vqvae.decode([sample], start_level=1, end_level=2, bs_chunks=sample.shape[0])
-        # first_100 = x.squeeze(-1)[0][0:100]
-        # self.assertTrue(torch.allclose(first_100, self.expected_x, atol=1e-4))
-
-        # inputs, _ = tokens["input_ids"], tokens["attention_masks"]
-        # start = timeit.default_timer()
-        # zs = model.ancestral_sample(inputs, chunk_size=32)
-        # print(f"time to sample : {timeit.default_timer() - start}")
-        # self.assertTrue(torch.allclose(zs[0][0][0:50], self.top_50_expected_zs.long(), atol=1e-4))
-
-
 if __name__ == "__main__":
     tester = Jukebox5bModelTester()
     tester.test_1b_lyrics()
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index 83ea3509c95cd..74161be6794ab 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright 2022 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,199 +13,197 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-# import json
-# import os
-# import unittest
-
-# from transformers import JukeboxTokenizer, JukeboxTokenizerFast
-# from transformers.models.jukebox.tokenization_jukebox import VOCAB_FILES_NAMES
-# from transformers.testing_utils import require_tokenizers
-
 import unittest
 
-# from ..test_tokenization_common import TokenizerTesterMixin
 from transformers import JukeboxTokenizer
-
-
-class JukeBoxIntegrationTest(unittest.TestCase):
-
-    # @slow
-    def test_tokenizer(self):
+from transformers.testing_utils import require_torch
+
+
+class JukeboxTokenizationTest(unittest.TestCase):
+    tokenizer_class = JukeboxTokenizer
+    metas = dict(
+        artist="Zac Brown Band",
+        genres="Country",
+        lyrics="""I met a traveller from an antique land,
+        Who said "Two vast and trunkless legs of stone
+        Stand in the desert. . . . Near them, on the sand,
+        Half sunk a shattered visage lies, whose frown,
+        And wrinkled lip, and sneer of cold command,
+        Tell that its sculptor well those passions read
+        Which yet survive, stamped on these lifeless things,
+        The hand that mocked them, and the heart that fed;
+        And on the pedestal, these words appear:
+        My name is Ozymandias, King of Kings;
+        Look on my Works, ye Mighty, and despair!
+        Nothing beside remains. Round the decay
+        Of that colossal Wreck, boundless and bare
+        The lone and level sands stretch far away
+        """,
+    )
+
+    @require_torch
+    def test_1b_lyrics_tokenizer(self):
         """
         how to run the same test with openAI
         ...
         """
+        import torch
 
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox")
-        tokenizer.max_n_lyric_tokens = 20
-        tokens = tokenizer("Alan Jackson", "rock", "old town road", 4 * 60 * 44100, 8192 * 8 * 4 * 4, 0)
-        inputs, attention_masks = tokens["input_ids"]["y"], tokens["attention_masks"]
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-1b-lyrics")
+        tokens = tokenizer(**self.metas)["input_ids"]
+        # fmt: off
         EXPECTED_OUTPUT = [
-            10584000,
-            0,
-            1048576,
-            145,
-            8,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-            0,
-            41,
-            38,
-            30,
-            77,
-            46,
-            41,
-            49,
-            40,
-            77,
-            44,
-            41,
-            27,
-            30,
+            torch.tensor([[0, 0, 0, 1069, 11]]),
+            torch.tensor([[0, 0, 0, 1069, 11]]),
+            torch.tensor([[
+                0, 0, 0, 7169, 507, 9, 76, 39, 31, 46, 76, 27,
+                76, 46, 44, 27, 48, 31, 38, 38, 31, 44, 76, 32,
+                44, 41, 39, 76, 27, 40, 76, 27, 40, 46, 35, 43,
+                47, 31, 76, 38, 27, 40, 30, 64, 78, 76, 76, 76,
+                76, 76, 76, 76, 76, 23, 34, 41, 76, 45, 27, 35,
+                30, 76, 71, 20, 49, 41, 76, 48, 27, 45, 46, 76,
+                27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45,
+                45, 76, 38, 31, 33, 45, 76, 41, 32, 76, 45, 46,
+                41, 40, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
+                19, 46, 27, 40, 30, 76, 35, 40, 76, 46, 34, 31,
+                76, 30, 31, 45, 31, 44, 46, 63, 76, 63, 76, 63,
+                76, 63, 76, 14, 31, 27, 44, 76, 46, 34, 31, 39,
+                64, 76, 41, 40, 76, 46, 34, 31, 76, 45, 27, 40,
+                30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76, 8,
+                27, 38, 32, 76, 45, 47, 40, 37, 76, 27, 76, 45,
+                34, 27, 46, 46, 31, 44, 31, 30, 76, 48, 35, 45,
+                27, 33, 31, 76, 38, 35, 31, 45, 64, 76, 49, 34,
+                41, 45, 31, 76, 32, 44, 41, 49, 40, 64, 78, 76,
+                76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76, 49,
+                44, 35, 40, 37, 38, 31, 30, 76, 38, 35, 42, 64,
+                76, 27, 40, 30, 76, 45, 40, 31, 31, 44, 76, 41,
+                32, 76, 29, 41, 38, 30, 76, 29, 41, 39, 39, 27,
+                40, 30, 64, 78, 76, 76, 76, 76, 76, 76, 76, 76,
+                20, 31, 38, 38, 76, 46, 34, 27, 46, 76, 35, 46,
+                45, 76, 45, 29, 47, 38, 42, 46, 41, 44, 76, 49,
+                31, 38, 38, 76, 46, 34, 41, 45, 31, 76, 42, 27,
+                45, 45, 35, 41, 40, 45, 76, 44, 31, 27, 30, 78,
+                76, 76, 76, 76, 76, 76, 76, 76, 23, 34, 35, 29,
+                34, 76, 51, 31, 46, 76, 45, 47, 44, 48, 35, 48,
+                31, 64, 76, 45, 46, 27, 39, 42, 31, 30, 76, 41,
+                40, 76, 46, 34, 31, 45, 31, 76, 38, 35, 32, 31,
+                38, 31, 45, 45, 76, 46, 34, 35, 40, 33, 45, 64,
+                78, 76, 76, 76, 76, 76, 76, 76, 76, 20, 34, 31,
+                76, 34, 27, 40, 30, 76, 46, 34, 27, 46, 76, 39,
+                41, 29, 37, 31, 30, 76, 46, 34, 31, 39, 64, 76,
+                27, 40, 30, 76, 46, 34, 31, 76, 34, 31, 27, 44,
+                46, 76, 46, 34, 27, 46, 76, 32, 31, 30, 66, 78,
+                76, 76, 76, 76, 76, 76, 76, 76, 1, 40, 30, 76,
+                41, 40, 76, 46, 34, 31, 76, 42, 31, 30, 31, 45,
+                46, 27, 38, 64, 76, 46, 34, 31, 45, 31, 76, 49,
+                41, 44, 30, 45, 76, 27, 42, 42, 31, 27, 44, 65,
+                78, 76, 76, 76, 76, 76, 76, 76, 76, 13, 51, 76,
+                40, 27, 39, 31, 76, 35, 45, 76, 15, 52, 51, 39,
+                27, 40, 30, 35, 27, 45, 64, 76, 11, 35, 40, 33,
+                76, 41, 32, 76, 11, 35, 40, 33, 45, 66, 78, 76,
+                76, 76, 76, 76, 76, 76, 76, 12, 41, 41, 37, 76,
+                41, 40, 76, 39, 51, 76, 23, 41, 44, 37, 45, 64,
+                76, 51, 31, 76, 13, 35, 33, 34, 46, 51, 64, 76,
+                27, 40, 30, 76, 30, 31, 45, 42, 27, 35, 44, 67,
+                78, 76, 76, 76, 76, 76, 76, 76, 76, 14, 41, 46,
+                34, 35, 40, 33, 76, 28, 31, 45, 35, 30, 31, 76,
+                44, 31, 39, 27, 35, 40, 45, 63, 76, 18, 41, 47,
+                40, 30, 76, 46, 34, 31, 76, 30, 31, 29, 27, 51,
+                78, 76, 76, 76, 76, 76, 76, 76, 76, 15, 32, 76,
+                46, 34, 27, 46, 76, 29, 41, 38, 41, 45, 45, 27,
+                38, 76, 23, 44, 31, 29, 37, 64, 76, 28, 41, 47,
+                40, 30, 38, 31, 45, 45, 76, 27, 40, 30, 76, 28,
+                27, 44, 31, 78, 76, 76, 76, 76, 76, 76, 76, 76,
+                20, 34, 31, 76, 38, 41, 40, 31, 76, 27, 40, 30,
+                76, 38, 31, 48, 31, 38, 76, 45, 27, 40, 30, 45,
+                76, 45, 46, 44, 31, 46, 29, 34, 76, 32, 27, 44,
+                76, 27, 49, 27, 51, 78, 76, 76, 76, 76, 76, 76,
+                76, 76]])
         ]
+        # fmt: on
+        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
+        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
+        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))
 
-        self.assertTrue(inputs == EXPECTED_OUTPUT)
-        EXPECTED_MASK_OUTPUT = [-float("inf")] * 7 + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-        self.assertTrue(attention_masks == EXPECTED_MASK_OUTPUT)
-
-
-# @require_tokenizers
-# class JukeboxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
-#     tokenizer_class = JukeboxTokenizer
-#     rust_tokenizer_class = JukeboxTokenizerFast
-#     test_rust_tokenizer = True
-#     from_pretrained_kwargs = {"add_prefix_space": True}
-#     test_seq2seq = False
-
-#     def setUp(self):
-#         super().setUp()
-
-#         vocab = {
-#             "artist": {"Marron 5": 0, "Bob Marley": 1},
-#             "genres": {"Pop": 0, "Rap": 1},
-#             "lyrics": {
-#                 c: i
-#                 for c, i in enumerate(
-#                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-'\"()[] \t\n"
-#                 )
-#             },
-#         }
-#         self.special_tokens_map = {"unk_token": "<unk>"}
-
-#         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-#         with open(self.vocab_file, "w", encoding="utf-8") as fp:
-#             fp.write(json.dumps(vocab) + "\n")
-
-#     def get_tokenizer(self, **kwargs):
-#         kwargs.update(self.special_tokens_map)
-#         return JukeboxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-#     def get_rust_tokenizer(self, **kwargs):
-#         kwargs.update(self.special_tokens_map)
-#         return JukeboxTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
-
-#     def get_input_output_texts(self, tokenizer):
-#         input_text = "lower newer"
-#         output_text = "lower newer"
-#         return input_text, output_text
-
-#     # TODO: mostly modify this part
-#     def test_full_tokenizer(self):
-#         tokenizer = JukeboxTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-#         text = "lower newer"
-#         bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
-#         tokens = tokenizer.tokenize(text, add_prefix_space=True)
-#         self.assertListEqual(tokens, bpe_tokens)
-
-#         input_tokens = tokens + [tokenizer.unk_token]
-#         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-#         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-#     def test_rust_and_python_full_tokenizers(self):
-#         if not self.test_rust_tokenizer:
-#             return
-
-#         tokenizer = self.get_tokenizer()
-#         rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-
-#         sequence = "lower newer"
-
-#         # Testing tokenization
-#         tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
-#         rust_tokens = rust_tokenizer.tokenize(sequence)
-#         self.assertListEqual(tokens, rust_tokens)
-
-#         # Testing conversion to ids without special tokens
-#         ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
-#         rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-#         self.assertListEqual(ids, rust_ids)
-
-#         # Testing conversion to ids with special tokens
-#         rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-#         ids = tokenizer.encode(sequence, add_prefix_space=True)
-#         rust_ids = rust_tokenizer.encode(sequence)
-#         self.assertListEqual(ids, rust_ids)
-
-#         # Testing the unknown token
-#         input_tokens = tokens + [rust_tokenizer.unk_token]
-#         input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-#         self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-#     def test_pretokenized_inputs(self, *args, **kwargs):
-#         # It's very difficult to mix/test pretokenization with byte-level
-#         # And get both Jukebox and Roberta to work at the same time (mostly an issue of adding a space before the string)
-#         pass
-
-#     def test_padding(self, max_length=15):
-#         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-#             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-#                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-#                 # Simple input
-#                 s = "This is a simple input"
-#                 s2 = ["This is a simple input 1", "This is a simple input 2"]
-#                 p = ("This is a simple input", "This is a pair")
-#                 p2 = [
-#                     ("This is a simple input 1", "This is a simple input 2"),
-#                     ("This is a simple pair 1", "This is a simple pair 2"),
-#                 ]
-
-#                 # Simple input tests
-#                 self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
-
-#                 # Simple input
-#                 self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
-
-#                 # Simple input
-#                 self.assertRaises(
-#                     ValueError,
-#                     tokenizer_r.batch_encode_plus,
-#                     s2,
-#                     max_length=max_length,
-#                     padding="max_length",
-#                 )
-
-#                 # Pair input
-#                 self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
-
-#                 # Pair input
-#                 self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
-
-#                 # Pair input
-#                 self.assertRaises(
-#                     ValueError,
-#                     tokenizer_r.batch_encode_plus,
-#                     p2,
-#                     max_length=max_length,
-#                     padding="max_length",
-#                 )
+    @require_torch
+    def test_5b_lyrics_tokenizer(self):
+        """
+        The outputs are similar that open AI but do not have the same format as this one is adapted to the HF integration.
+        """
+        import torch
 
-#     # tokenizer has no padding token
-#     def test_padding_different_model_input_name(self):
-#         pass
+        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-5b-lyrics")
+        tokens = tokenizer(**self.metas)["input_ids"]
+        # fmt: off
+        EXPECTED_OUTPUT = [
+            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
+            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
+            torch.tensor([[
+                0, 0, 0, 1069, 11, -1, -1, -1, -1, 9, 77, 39,
+                31, 46, 77, 27, 77, 46, 44, 27, 48, 31, 38, 38,
+                31, 44, 77, 32, 44, 41, 39, 77, 27, 40, 77, 27,
+                40, 46, 35, 43, 47, 31, 77, 38, 27, 40, 30, 64,
+                79, 77, 77, 77, 77, 77, 77, 77, 77, 23, 34, 41,
+                77, 45, 27, 35, 30, 77, 72, 20, 49, 41, 77, 48,
+                27, 45, 46, 77, 27, 40, 30, 77, 46, 44, 47, 40,
+                37, 38, 31, 45, 45, 77, 38, 31, 33, 45, 77, 41,
+                32, 77, 45, 46, 41, 40, 31, 79, 77, 77, 77, 77,
+                77, 77, 77, 77, 19, 46, 27, 40, 30, 77, 35, 40,
+                77, 46, 34, 31, 77, 30, 31, 45, 31, 44, 46, 63,
+                77, 63, 77, 63, 77, 63, 77, 14, 31, 27, 44, 77,
+                46, 34, 31, 39, 64, 77, 41, 40, 77, 46, 34, 31,
+                77, 45, 27, 40, 30, 64, 79, 77, 77, 77, 77, 77,
+                77, 77, 77, 8, 27, 38, 32, 77, 45, 47, 40, 37,
+                77, 27, 77, 45, 34, 27, 46, 46, 31, 44, 31, 30,
+                77, 48, 35, 45, 27, 33, 31, 77, 38, 35, 31, 45,
+                64, 77, 49, 34, 41, 45, 31, 77, 32, 44, 41, 49,
+                40, 64, 79, 77, 77, 77, 77, 77, 77, 77, 77, 1,
+                40, 30, 77, 49, 44, 35, 40, 37, 38, 31, 30, 77,
+                38, 35, 42, 64, 77, 27, 40, 30, 77, 45, 40, 31,
+                31, 44, 77, 41, 32, 77, 29, 41, 38, 30, 77, 29,
+                41, 39, 39, 27, 40, 30, 64, 79, 77, 77, 77, 77,
+                77, 77, 77, 77, 20, 31, 38, 38, 77, 46, 34, 27,
+                46, 77, 35, 46, 45, 77, 45, 29, 47, 38, 42, 46,
+                41, 44, 77, 49, 31, 38, 38, 77, 46, 34, 41, 45,
+                31, 77, 42, 27, 45, 45, 35, 41, 40, 45, 77, 44,
+                31, 27, 30, 79, 77, 77, 77, 77, 77, 77, 77, 77,
+                23, 34, 35, 29, 34, 77, 51, 31, 46, 77, 45, 47,
+                44, 48, 35, 48, 31, 64, 77, 45, 46, 27, 39, 42,
+                31, 30, 77, 41, 40, 77, 46, 34, 31, 45, 31, 77,
+                38, 35, 32, 31, 38, 31, 45, 45, 77, 46, 34, 35,
+                40, 33, 45, 64, 79, 77, 77, 77, 77, 77, 77, 77,
+                77, 20, 34, 31, 77, 34, 27, 40, 30, 77, 46, 34,
+                27, 46, 77, 39, 41, 29, 37, 31, 30, 77, 46, 34,
+                31, 39, 64, 77, 27, 40, 30, 77, 46, 34, 31, 77,
+                34, 31, 27, 44, 46, 77, 46, 34, 27, 46, 77, 32,
+                31, 30, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77,
+                1, 40, 30, 77, 41, 40, 77, 46, 34, 31, 77, 42,
+                31, 30, 31, 45, 46, 27, 38, 64, 77, 46, 34, 31,
+                45, 31, 77, 49, 41, 44, 30, 45, 77, 27, 42, 42,
+                31, 27, 44, 65, 79, 77, 77, 77, 77, 77, 77, 77,
+                77, 13, 51, 77, 40, 27, 39, 31, 77, 35, 45, 77,
+                15, 52, 51, 39, 27, 40, 30, 35, 27, 45, 64, 77,
+                11, 35, 40, 33, 77, 41, 32, 77, 11, 35, 40, 33,
+                45, 66, 79, 77, 77, 77, 77, 77, 77, 77, 77, 12,
+                41, 41, 37, 77, 41, 40, 77, 39, 51, 77, 23, 41,
+                44, 37, 45, 64, 77, 51, 31, 77, 13, 35, 33, 34,
+                46, 51, 64, 77, 27, 40, 30, 77, 30, 31, 45, 42,
+                27, 35, 44, 67, 79, 77, 77, 77, 77, 77, 77, 77,
+                77, 14, 41, 46, 34, 35, 40, 33, 77, 28, 31, 45,
+                35, 30, 31, 77, 44, 31, 39, 27, 35, 40, 45, 63,
+                77, 18, 41, 47, 40, 30, 77, 46, 34, 31, 77, 30,
+                31, 29, 27, 51, 79, 77, 77, 77, 77, 77, 77, 77,
+                77, 15, 32, 77, 46, 34, 27, 46, 77, 29, 41, 38,
+                41, 45, 45, 27, 38, 77, 23, 44, 31, 29, 37, 64,
+                77, 28, 41, 47, 40, 30, 38, 31, 45, 45, 77, 27,
+                40, 30, 77, 28, 27, 44, 31, 79, 77, 77, 77, 77,
+                77, 77, 77, 77, 20, 34, 31, 77, 38, 41, 40, 31,
+                77, 27, 40, 30, 77, 38, 31, 48, 31, 38, 77, 45,
+                27, 40, 30, 45, 77, 45, 46, 44, 31, 46, 29, 34,
+                77, 32, 27, 44, 77, 27, 49, 27, 51, 79, 77, 77,
+                77, 77, 77, 77, 77, 77]])
+        ]
+        # fmt: on
+        self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
+        self.assertTrue(torch.allclose(tokens[1], EXPECTED_OUTPUT[1]))
+        self.assertTrue(torch.allclose(tokens[2], EXPECTED_OUTPUT[2]))

From 1c2e5a922ef735d51606df045b42f7a1e58dd3d2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 09:37:32 +0000
Subject: [PATCH 049/196] delete notebook from wrong folder

---
 .../models/jukebox/jukebox_sampling.ipynb     | 131 ------------------
 1 file changed, 131 deletions(-)
 delete mode 100644 src/transformers/models/jukebox/jukebox_sampling.ipynb

diff --git a/src/transformers/models/jukebox/jukebox_sampling.ipynb b/src/transformers/models/jukebox/jukebox_sampling.ipynb
deleted file mode 100644
index 837840f2abfe8..0000000000000
--- a/src/transformers/models/jukebox/jukebox_sampling.ipynb
+++ /dev/null
@@ -1,131 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Music generation with HF's Jukebox model \n",
-    "For now the PR has not been merged on the official repo yet, and we have to use Arthur's branch `jukebox`. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install the correct transformer version \n",
-    "!git clone --branch=jukebox https://github.com/ArthurZucker/transformers.git\n",
-    "%pip install -e \".[dev]\"\n",
-    "!sudo apt-get install libsndfile-dev"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Lets import a few functionnalities and define the metadatas that will be used\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import JukeboxModel , JukeboxTokenizer\n",
-    "import time\n",
-    "import torch\n",
-    "from transformers.models.jukebox.modeling_jukebox import load_prompts\n",
-    "metas = dict(\n",
-    "    artist=\"The weeknd\",\n",
-    "    genres=\"Rap\",\n",
-    "    lyrics=\"\"\"I met a traveller from an antique land,\n",
-    "Who said \"Two vast and trunkless legs of stone\n",
-    "Stand in the desert. . . . Near them, on the sand,\n",
-    "Half sunk a shattered visage lies, whose frown,\n",
-    "And wrinkled lip, and sneer of cold command,\n",
-    "Tell that its sculptor well those passions read\n",
-    "Which yet survive, stamped on these lifeless things,\n",
-    "The hand that mocked them, and the heart that fed;\n",
-    "And on the pedestal, these words appear:\n",
-    "My name is Ozymandias, King of Kings;\n",
-    "Look on my Works, ye Mighty, and despair!\n",
-    "Nothing beside remains. Round the decay\n",
-    "Of that colossal Wreck, boundless and bare\n",
-    "The lone and level sands stretch far away\n",
-    "\"\"\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model_id = \"ArthurZ/jukebox-1b-lyrics\"\n",
-    "model = JukeboxModel.from_pretrained(model_id).eval()\n",
-    "tokenizer = JukeboxTokenizer.from_pretrained(model_id)\n",
-    "tokens = tokenizer(**metas)[\"input_ids\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's start by generating from scratch only using the conditionning on the meta datas."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "zs = model.ancestral_sample([i.cuda() for i in tokens], chunk_size=32, sample_length_in_seconds=10, offset = 30)\n",
-    "print(\"generation time for length : \",time.time()- start)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now lets load $\\frac{40000}{sampling rate}$ seconds of a random audio and generate a continuation! \n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "start = time.time()\n",
-    "x = load_prompts([\"prompts/test.wav\"],40000,model.config)\n",
-    "zs = model.primed_sample(x.cuda(), [i.cuda() for i in tokens], chunk_size=32, sample_length_in_seconds=60)\n",
-    "print(\"generation time for length : \",time.time()- start)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3.7.12 ('base')",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.7.12"
-  },
-  "orig_nbformat": 4,
-  "vscode": {
-   "interpreter": {
-    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From d90b6f54718291ed41c6fb465aa2d5903b92ac71 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Mon, 25 Jul 2022 11:37:55 +0200
Subject: [PATCH 050/196] Update
 src/transformers/models/jukebox/modeling_jukebox.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 8563ff342ac0d..92b34561f9de5 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

From 141dbdb02cfa49271fc63459d2e4d1c0e3f8c72b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 10:25:38 +0000
Subject: [PATCH 051/196] handle return tensor ( no import torch )

---
 .../models/jukebox/tokenization_jukebox.py    | 107 +++++++++++++++---
 1 file changed, 94 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 2f9fd8eff732b..00ed273b41440 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -18,17 +18,28 @@
 import json
 import os
 from json.encoder import INFINITY
-from typing import Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
 
-import torch
+import numpy as np
 
 import regex as re
 from tokenizers import normalizers
+from transformers.testing_utils import require_torch
+from transformers.utils.generic import _is_jax, _is_numpy
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...utils import logging
+from ...tokenization_utils_base import BatchEncoding
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
 
 
+if TYPE_CHECKING:
+    if is_torch_available():
+        import torch
+    if is_tf_available():
+        import tensorflow as tf
+    if is_flax_available():
+        import jax.numpy as jnp  # noqa: F401
+
 logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {
@@ -53,7 +64,15 @@
     "jukebox": 512,  # corresonds to the dummy-model ?
 }
 
+"""" batch_outputs = BatchEncoding(
+    encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+)
+
+
+"""
 
+
+@require_torch
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
     """
     Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
@@ -73,6 +92,8 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
             which represent the overall length of the signal,
     """
+    import torch
+
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
@@ -108,9 +129,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     >>> from transformers import JukeboxTokenizer
     >>> tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
     >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
-    [[6785],[546], [0, 0, 0, 0, 0, 0, 0,   41,       38,       30,
-                    77,       46,       41,       49,       40,
-                    77,       44,       41,       27,       30] ]
+    ## TODO UPDATE THIS OUTPUT
     >>> tokenizer("Alan Jackson", "Country Rock")['input_ids']
     [6785],[546]]
     ```
@@ -330,8 +349,69 @@ def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
 
     # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
     # a type argument to add an artist token with self.getattr('artist') ?
+    def convert_to_tensors(
+        self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
+    ):
+        """
+        Convert the inner content to tensors.
+
+        Args:
+            tensor_type (`str` or [`~utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
+                `None`, no modification is done.
+            prepend_batch_axis (`int`, *optional*, defaults to `False`):
+                Whether or not to add the batch dimension during the conversion.
+        """
+        # Convert to TensorType
+        if not isinstance(tensor_type, TensorType):
+            tensor_type = TensorType(tensor_type)
+
+        # Get a function reference for the correct framework
+        if tensor_type == TensorType.TENSORFLOW:
+            if not is_tf_available():
+                raise ImportError(
+                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
+                )
+            import tensorflow as tf
+
+            as_tensor = tf.constant
+            is_tensor = tf.is_tensor
+        elif tensor_type == TensorType.PYTORCH:
+            if not is_torch_available():
+                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
+            import torch
+
+            as_tensor = torch.tensor
+            is_tensor = torch.is_tensor
+        elif tensor_type == TensorType.JAX:
+            if not is_flax_available():
+                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
+            import jax.numpy as jnp  # noqa: F811
+
+            as_tensor = jnp.array
+            is_tensor = _is_jax
+        else:
+            as_tensor = np.asarray
+            is_tensor = _is_numpy
+
+        # Do the tensor conversion in batch
+
+        try:
+            if prepend_batch_axis:
+                inputs = [inputs]
+
+            if not is_tensor(inputs):
+                inputs = as_tensor(inputs)
+        except:  # noqa E722
 
-    def __call__(self, artist, genres, lyrics, return_tensor="pt"):
+            raise ValueError(
+                "Unable to create tensor, you should probably activate truncation and/or padding "
+                "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
+            )
+
+        return inputs
+
+    def __call__(self, artist, genres, lyrics, return_tensors="pt") -> BatchEncoding:
         """Convert the raw string to a list of token ids
 
         Args:
@@ -356,16 +436,17 @@ def __call__(self, artist, genres, lyrics, return_tensor="pt"):
         attention_masks = [-INFINITY] * len(full_tokens[-1])
         # TODO properly handle the return pt tensor option
         input_ids = [
-            torch.tensor([input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]])
+            self.convert_to_tensors(
+                [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors
+            )
             for i in range(len(self.version))
         ]
-        if return_tensor == "pt":
-            # TODO use BatchEncoding to support
-
-            return {
+        return BatchEncoding(
+            {
                 "input_ids": input_ids,
-                "attention_masks": torch.tensor(attention_masks),
+                "attention_masks": attention_masks,
             }
+        )
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """

From 7c8b228b77b311ee919eacf7255d77d90040e5b3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 12:09:49 +0000
Subject: [PATCH 052/196] fix torch not found

---
 .../models/jukebox/modeling_jukebox.py        |   6 +-
 .../models/jukebox/tokenization_jukebox.py    |   7 +-
 tests/models/jukebox/test_modeling_jukebox.py | 104 +++++++++---------
 3 files changed, 54 insertions(+), 63 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 8563ff342ac0d..465d4e54724cf 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -3112,7 +3112,7 @@ def _sample(
         zs,
         labels,
         metas,
-        sample_levels,
+        sample_levels=None,
         chunk_size=32,
         sampling_temperature=0.98,
         lower_batch_size=16,
@@ -3154,6 +3154,8 @@ def _sample(
         ]
         hps = self.config
         self.start_time = time.strftime("%Y-%m-%d-%Hh%M")
+        if sample_levels is None:
+            sample_levels = range(len(self.priors))
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
             self.priors[level].to(zs[0].device).eval()
@@ -3179,7 +3181,7 @@ def _sample(
             if not os.path.exists(logdir):
                 os.makedirs(logdir)
             torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-            save_wav(logdir, level, metas, x, hps.sr)
+            save_wav(logdir, level, metas=metas, audi=x, sr=hps.sr)
             if (
                 alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
             ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 00ed273b41440..875b486beb3bb 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -18,7 +18,7 @@
 import json
 import os
 from json.encoder import INFINITY
-from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -35,10 +35,6 @@
 if TYPE_CHECKING:
     if is_torch_available():
         import torch
-    if is_tf_available():
-        import tensorflow as tf
-    if is_flax_available():
-        import jax.numpy as jnp  # noqa: F401
 
 logger = logging.get_logger(__name__)
 
@@ -92,7 +88,6 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
             which represent the overall length of the signal,
     """
-    import torch
 
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index f06ac25ca0e10..4f4f2dec3a458 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -48,33 +48,29 @@ class Jukebox1bModelTester(unittest.TestCase):
     """,
     )
     # fmt: off
-    EXPECTED_OUTPUT_2 = torch.tensor(
-        [
-            1864, 1536, 1213, 1869, 1321, 1597, 519, 947, 1177, 789, 1434, 653,
-            653, 653, 653, 653, 653, 653, 653, 653, 1007, 1472, 255, 1228,
-            555, 1272, 1379, 1423, 1673, 427, 1683, 1321, 475, 416, 1177, 1827,
-            1106, 1127, 1494, 812
-        ]
-    )
-    EXPECTED_OUTPUT_1 = torch.tensor(
-        [
-            1125, 1585, 1485, 2020, 1141, 1680, 381, 539, 1368, 642, 1585, 284,
-            717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
-            593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
-            1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
-            2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
-            1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
-            591, 1803, 58, 591, 529, 1079, 642, 591
-        ]
-    )
-    EXPECTED_OUTPUT_0 = torch.tensor(
-        [
-            1979, 1613, 290, 1843, 844, 1427, 293, 616, 1771, 632, 591, 290,
-            234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
-            290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
-            854, 1120, 1572, 719
-        ]
-    )
+    EXPECTED_OUTPUT_2 = [
+        1864, 1536, 1213, 1869, 1321, 1597, 519, 947, 1177, 789, 1434, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 1007, 1472, 255, 1228,
+        555, 1272, 1379, 1423, 1673, 427, 1683, 1321, 475, 416, 1177, 1827,
+        1106, 1127, 1494, 812
+    ]
+    
+    EXPECTED_OUTPUT_1 = [
+        1125, 1585, 1485, 2020, 1141, 1680, 381, 539, 1368, 642, 1585, 284,
+        717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
+        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
+        1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
+        2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
+        1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
+        591, 1803, 58, 591, 529, 1079, 642, 591
+    ]
+    
+    EXPECTED_OUTPUT_0 = [
+        1979, 1613, 290, 1843, 844, 1427, 293, 616, 1771, 632, 591, 290,
+        234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
+        290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
+        854, 1120, 1572, 719
+    ]
     # fmt: on
 
     def prepare_inputs(self, model_id):
@@ -82,6 +78,7 @@ def prepare_inputs(self, model_id):
         tokens = tokenizer(**self.metas)["input_ids"]
         return tokens
 
+    @require_torch
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id, cond_res_scale=[None, True, False]).eval()
@@ -107,6 +104,7 @@ def test_sampling(self):
         assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
 
     @slow
+    @require_torch
     def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
@@ -148,34 +146,30 @@ class Jukebox5bModelTester(unittest.TestCase):
     )
 
     # fmt: off
-    EXPECTED_OUTPUT_2 = torch.tensor(
-        [
-            1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 1489, 653,
-            653, 653, 653, 653, 653, 653, 653, 653
-        ]
-    )
-    EXPECTED_OUTPUT_1 = torch.tensor(
-        [
-            1125, 416, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 1585, 284,
-            717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
-            593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
-            1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
-            2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
-            1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
-            591, 1803, 58, 591, 529, 1079, 642, 591
-        ]
-    )
-    EXPECTED_OUTPUT_0 = torch.tensor(
-        [
-            1755, 1061, 234, 1755, 290, 1572, 234, 491, 992, 417, 591, 290,
-            234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
-            290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
-            854, 1120, 1572, 719, 491, 34, 755, 632, 844, 755, 1802, 225,
-            2013, 1814, 1148, 616, 185, 1979, 1460, 983, 1168, 1613, 34, 1242,
-            632, 34, 34, 1982, 1510, 554, 983, 1784, 526, 1691, 1268, 1268,
-            290, 755, 34, 307, 222, 234, 648, 526
-        ]
-    )
+    EXPECTED_OUTPUT_2 = [
+        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 1489, 653,
+        653, 653, 653, 653, 653, 653, 653, 653
+    ]
+    
+    EXPECTED_OUTPUT_1 = [
+        1125, 416, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 1585, 284,
+        717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
+        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
+        1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
+        2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
+        1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
+        591, 1803, 58, 591, 529, 1079, 642, 591
+    ]
+    
+    EXPECTED_OUTPUT_0 = [
+        1755, 1061, 234, 1755, 290, 1572, 234, 491, 992, 417, 591, 290,
+        234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
+        290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
+        854, 1120, 1572, 719, 491, 34, 755, 632, 844, 755, 1802, 225,
+        2013, 1814, 1148, 616, 185, 1979, 1460, 983, 1168, 1613, 34, 1242,
+        632, 34, 34, 1982, 1510, 554, 983, 1784, 526, 1691, 1268, 1268,
+        290, 755, 34, 307, 222, 234, 648, 526
+    ]
     # fmt: on
 
     def prepare_inputs(self, model_id):

From 1798aaf722a2f187be8167f9d4feb020f13f498a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 12:40:26 +0000
Subject: [PATCH 053/196] style

---
 .../models/jukebox/modeling_jukebox.py        | 48 +++++++++++--------
 tests/models/jukebox/test_modeling_jukebox.py | 47 +++++++++---------
 2 files changed, 50 insertions(+), 45 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 465d4e54724cf..487e2a9158a3b 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2905,13 +2905,19 @@ def get_starts(total_length, n_ctx, hop_length):
 def save_wav(fname, lvl, metas, aud, sr):
     import soundfile
 
-    artists, genres, lyrics = metas.values()
     # clip before saving?
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
-        soundfile.write(
-            f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav", aud[i], samplerate=sr, format="wav"
-        )
+        if metas is not None:
+            artists, genres, lyrics = metas[i]
+            soundfile.write(
+                f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
+                aud[i],
+                samplerate=sr,
+                format="wav",
+            )
+        else:
+            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
 
 
 def get_alignment(x, zs, labels, prior, level, fp16, hps):
@@ -3111,8 +3117,8 @@ def _sample(
         self,
         zs,
         labels,
-        metas,
         sample_levels=None,
+        metas=None,
         chunk_size=32,
         sampling_temperature=0.98,
         lower_batch_size=16,
@@ -3121,6 +3127,7 @@ def _sample(
         alignments=None,
         sample_tokens=None,
         offset=0,
+        save_wav=True,
     ):
         top_prior = self.priors[-1]
         sampling_kwargs = [
@@ -3177,17 +3184,18 @@ def _sample(
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
             self.vqvae.to("cpu")
 
-            logdir = f"{self.start_time}/level_{level}"
-            if not os.path.exists(logdir):
-                os.makedirs(logdir)
-            torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-            save_wav(logdir, level, metas=metas, audi=x, sr=hps.sr)
-            if (
-                alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
-            ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
-                # either use level which will be the given lovel or use the total nb of levels?
-                # alignments = get_alignment(x, zs, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
-                pass  # TODO this is a really dirty fix
+            if save_wav:
+                logdir = f"{self.start_time}/level_{level}"
+                if not os.path.exists(logdir):
+                    os.makedirs(logdir)
+                torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
+                save_wav(logdir, level, metas=metas, aud=x, sr=hps.sr)
+                if (
+                    alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
+                ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
+                    # either use level which will be the given lovel or use the total nb of levels?
+                    # alignments = get_alignment(x, zs, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
+                    pass  # TODO this is a really dirty fix
         return zs
 
     # Generate ancestral samples given a list of artists and genres
@@ -3198,15 +3206,15 @@ def ancestral_sample(self, labels, metas, n_samples=1, **sampling_kwargs):
         return zs
 
     # Continue ancestral sampling from previously saved codes
-    def continue_sample(self, zs, labels, **sampling_kwargs):
+    def continue_sample(self, zs, labels, metas, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
         return zs
 
     # Upsample given already generated upper-level codes
     def upsample(self, zs, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors) - 1))
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
         return zs
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
@@ -3216,5 +3224,5 @@ def primed_sample(self, x, labels, **sampling_kwargs):
         with torch.no_grad():
             zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
         self.vqvae.to("cpu")
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
         return zs
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 4f4f2dec3a458..abf6dea9d29db 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -87,21 +87,21 @@ def test_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10)
-        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
-        zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
+        zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10)
-        assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
+        zs = model._sample(zs, labels, [1], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
-        zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
+        zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10)
-        assert torch.allclose(zs[0][0, :40], self.EXPECTED_OUTPUT_0)
+        zs = model._sample(zs, labels, [0], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     @require_torch
@@ -112,10 +112,8 @@ def test_slow_sampling(self):
         labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10)
-        print(zs[-1][0].cpu())
-        print(self.EXPECTED_OUTPUT_2)
-        assert torch.allclose(zs[-1][0].cpu(), self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
 
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
@@ -184,21 +182,21 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10)
-        assert torch.allclose(zs[-1][0], self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
-        zs[-1] = self.EXPECTED_OUTPUT_2.unsqueeze(0)
+        zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10)
-        assert torch.allclose(zs[-2][0, :80], self.EXPECTED_OUTPUT_1)
+        zs = model._sample(zs, labels, [1], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
-        zs[-2] = self.EXPECTED_OUTPUT_1.unsqueeze(0)
+        zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10)
-        assert torch.allclose(zs[0][0, :80], self.EXPECTED_OUTPUT_0)
+        zs = model._sample(zs, labels, [0], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[0][0, :80], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -208,8 +206,8 @@ def test_slow_sampling(self):
         labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10)
-        assert torch.allclose(zs[-1][0].cpu(), self.EXPECTED_OUTPUT_2)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
 
     def test_vqvae(self):
         # implement vavae decoding test at 3 levels using the expected outputs
@@ -217,6 +215,5 @@ def test_vqvae(self):
 
 
 if __name__ == "__main__":
-    tester = Jukebox5bModelTester()
-    tester.test_1b_lyrics()
-    tester.test_slow_sampling()
+    tester = Jukebox1bModelTester()
+    tester.test_sampling()

From 70f93cd0ec5d2a8d2f77f0d0ded424bcb196ec45 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 16:16:21 +0000
Subject: [PATCH 054/196] fixed slow test for 1b lyric

---
 .../models/jukebox/modeling_jukebox.py        |  6 +--
 tests/models/jukebox/test_modeling_jukebox.py | 40 +++++++++++++++----
 2 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index acad7cd82ca26..4cb8de577c7bd 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2909,7 +2909,7 @@ def save_wav(fname, lvl, metas, aud, sr):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
         if metas is not None:
-            artists, genres, lyrics = metas[i]
+            artists, genres, lyrics = metas[i].values() # twitter prompts or inputs are in the form of a dictionnary
             soundfile.write(
                 f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
                 aud[i],
@@ -3212,13 +3212,13 @@ def continue_sample(self, zs, labels, metas, **sampling_kwargs):
         return zs
 
     # Upsample given already generated upper-level codes
-    def upsample(self, zs, labels, **sampling_kwargs):
+    def upsample(self, zs, labels, metas, **sampling_kwargs):
         sample_levels = list(range(len(self.priors) - 1))
         zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
         return zs
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
-    def primed_sample(self, x, labels, **sampling_kwargs):
+    def primed_sample(self, x, labels, metas, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
         self.vqvae.to(x.device)
         with torch.no_grad():
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index abf6dea9d29db..ff0bcea073596 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -54,7 +54,7 @@ class Jukebox1bModelTester(unittest.TestCase):
         555, 1272, 1379, 1423, 1673, 427, 1683, 1321, 475, 416, 1177, 1827,
         1106, 1127, 1494, 812
     ]
-    
+
     EXPECTED_OUTPUT_1 = [
         1125, 1585, 1485, 2020, 1141, 1680, 381, 539, 1368, 642, 1585, 284,
         717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
@@ -64,15 +64,22 @@ class Jukebox1bModelTester(unittest.TestCase):
         1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
         591, 1803, 58, 591, 529, 1079, 642, 591
     ]
-    
+
     EXPECTED_OUTPUT_0 = [
         1979, 1613, 290, 1843, 844, 1427, 293, 616, 1771, 632, 591, 290,
         234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
         290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
         854, 1120, 1572, 719
     ]
-    # fmt: on
 
+    EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
+    EXPECTED_GPU_OUTPUTS = [
+        1150, 384, 222, 1612, 1063, 710, 984, 710, 1272, 405, 784, 2001,
+        1276, 778, 937, 256, 1368, 1053, 1421, 405, 710, 1425, 445, 1489,
+        1895, 947, 317, 1082, 947, 669, 1527, 1321, 1807, 756, 1150, 1150,
+        1489, 1139, 519, 475
+    ]
+    # fmt: on
     def prepare_inputs(self, model_id):
         tokenizer = JukeboxTokenizer.from_pretrained(model_id)
         tokens = tokenizer(**self.metas)["input_ids"]
@@ -106,14 +113,25 @@ def test_sampling(self):
     @slow
     @require_torch
     def test_slow_sampling(self):
+        
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
         labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+
+        top_prior = model.priors[-1]
+        start = 0 
+        z_conds = top_prior.get_z_conds(zs, start = start, end = start + top_prior.n_ctx)
+        y = top_prior.get_y(labels[-1].clone(), start, 1058304 ,0)
+        
+        self.assertIsNone(z_conds)
+        self.assertListEqual(y.cpu().numpy()[0][:10].tolist(),self.EXPECTED_Y_COND)
+
+        set_seed(0)
         zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
 
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
@@ -148,7 +166,7 @@ class Jukebox5bModelTester(unittest.TestCase):
         1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 1489, 653,
         653, 653, 653, 653, 653, 653, 653, 653
     ]
-    
+
     EXPECTED_OUTPUT_1 = [
         1125, 416, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 1585, 284,
         717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
@@ -158,7 +176,7 @@ class Jukebox5bModelTester(unittest.TestCase):
         1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
         591, 1803, 58, 591, 529, 1079, 642, 591
     ]
-    
+
     EXPECTED_OUTPUT_0 = [
         1755, 1061, 234, 1755, 290, 1572, 234, 491, 992, 417, 591, 290,
         234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
@@ -210,10 +228,18 @@ def test_slow_sampling(self):
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
 
     def test_vqvae(self):
+        # test encoding of an audio 
+        # test decoding 
         # implement vavae decoding test at 3 levels using the expected outputs
         pass
 
+    def test_primed_sampling(self):
+        pass
+
+    def test_upsampling(self):
+        pass
+
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_sampling()
+    tester.test_slow_sampling()

From a9df0a13702b4ee3ba7758049921c4326a3a5187 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 25 Jul 2022 16:18:00 +0000
Subject: [PATCH 055/196] style

---
 .../models/jukebox/modeling_jukebox.py           |  2 +-
 tests/models/jukebox/test_modeling_jukebox.py    | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 4cb8de577c7bd..ef6549f163208 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2909,7 +2909,7 @@ def save_wav(fname, lvl, metas, aud, sr):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
         if metas is not None:
-            artists, genres, lyrics = metas[i].values() # twitter prompts or inputs are in the form of a dictionnary
+            artists, genres, lyrics = metas[i].values()  # twitter prompts or inputs are in the form of a dictionnary
             soundfile.write(
                 f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
                 aud[i],
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index ff0bcea073596..a5b64be68e353 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -113,7 +113,7 @@ def test_sampling(self):
     @slow
     @require_torch
     def test_slow_sampling(self):
-        
+
         model_id = "ArthurZ/jukebox-1b-lyrics"
         model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
 
@@ -122,12 +122,12 @@ def test_slow_sampling(self):
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
 
         top_prior = model.priors[-1]
-        start = 0 
-        z_conds = top_prior.get_z_conds(zs, start = start, end = start + top_prior.n_ctx)
-        y = top_prior.get_y(labels[-1].clone(), start, 1058304 ,0)
-        
+        start = 0
+        z_conds = top_prior.get_z_conds(zs, start=start, end=start + top_prior.n_ctx)
+        y = top_prior.get_y(labels[-1].clone(), start, 1058304, 0)
+
         self.assertIsNone(z_conds)
-        self.assertListEqual(y.cpu().numpy()[0][:10].tolist(),self.EXPECTED_Y_COND)
+        self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         set_seed(0)
         zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
@@ -228,8 +228,8 @@ def test_slow_sampling(self):
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
 
     def test_vqvae(self):
-        # test encoding of an audio 
-        # test decoding 
+        # test encoding of an audio
+        # test decoding
         # implement vavae decoding test at 3 levels using the expected outputs
         pass
 

From 69845532b1662cca3b0c2bc721b55a883078eeb9 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 06:55:39 +0000
Subject: [PATCH 056/196] add VQVAE as pretrained model

---
 docs/source/en/model_doc/jukebox.mdx          |   4 +
 src/transformers/__init__.py                  |  13 +-
 src/transformers/models/jukebox/__init__.py   |   8 +-
 .../models/jukebox/modeling_jukebox.py        |  31 +++--
 src/transformers/utils/dummy_pt_objects.py    |   7 ++
 tests/models/jukebox/test_modeling_jukebox.py | 111 +++++++++++++-----
 utils/check_repo.py                           |   2 +
 7 files changed, 124 insertions(+), 52 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 02330e0a27c99..b6dc40f2fa392 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -64,3 +64,7 @@ The original code can be found [here](https://github.com/openai/jukebox).
 ## JukeboxModel
 
 [[autodoc]] JukeboxModel - forward 
+
+## JukeboxVQVAE 
+
+[[autodoc]] JukeboxVQVAE - forward 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index dac81db82f445..53fc955e6e83e 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1288,11 +1288,7 @@
         ]
     )
     _import_structure["models.jukebox"].extend(
-        [
-            "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "JukeboxModel",
-            "JukeboxPreTrainedModel",
-        ]
+        ["JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST", "JukeboxModel", "JukeboxPreTrainedModel", "JukeboxVQVAE"]
     )
     _import_structure["models.layoutlm"].extend(
         [
@@ -3888,7 +3884,12 @@
             ImageGPTPreTrainedModel,
             load_tf_weights_in_imagegpt,
         )
-        from .models.jukebox import JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST, JukeboxModel, JukeboxPreTrainedModel
+        from .models.jukebox import (
+            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            JukeboxModel,
+            JukeboxPreTrainedModel,
+            JukeboxVQVAE,
+        )
         from .models.layoutlm import (
             LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
             LayoutLMForMaskedLM,
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 427be11cc948a..c243a7f3b67be 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -36,6 +36,7 @@
         "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
         "JukeboxModel",
         "JukeboxPreTrainedModel",
+        "JukeboxVQVAE",
     ]
 
 if TYPE_CHECKING:
@@ -48,7 +49,12 @@
     except OptionalDependencyNotAvailable:
         pass
     else:
-        from .modeling_jukebox import JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST, JukeboxModel, JukeboxPreTrainedModel
+        from .modeling_jukebox import (
+            JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
+            JukeboxModel,
+            JukeboxPreTrainedModel,
+            JukeboxVQVAE,
+        )
 
 else:
     import sys
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ef6549f163208..fdbc4c13e2925 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -698,9 +698,9 @@ def average_metrics(_metrics):
     return {key: sum(vals) / len(vals) for key, vals in metrics.items()}
 
 
-class VQVAE(nn.Module):
+class JukeboxVQVAE(PreTrainedModel):
     def __init__(self, config):
-        super().__init__()
+        super().__init__(config)
         if not config.sample_length:
             downsamples = calculate_strides(config.vq_vae_strides_t, config.vq_vae_downs_t)
             top_raw_to_tokens = np.prod(downsamples)
@@ -2097,7 +2097,6 @@ def primed_sample(
                 x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
                 if self.add_cond_after_transformer:
                     x = x + cond
-                assert x.shape == (n_samples, 1, self.width)
                 x = self.x_out(x)  # Predictions
                 if get_preds:
                     preds.append(x)
@@ -3022,7 +3021,7 @@ def __init__(self, config):
 
         self.embed_dim = config.hidden_size
 
-        self.vqvae = VQVAE(config)
+        self.vqvae = JukeboxVQVAE(config)
         config.vqvae_z_shapes = self.vqvae.z_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
@@ -3117,7 +3116,7 @@ def _sample(
         self,
         zs,
         labels,
-        sample_levels=None,
+        sample_levels,
         metas=None,
         chunk_size=32,
         sampling_temperature=0.98,
@@ -3127,7 +3126,7 @@ def _sample(
         alignments=None,
         sample_tokens=None,
         offset=0,
-        save_wav=True,
+        save_results=True,
     ):
         top_prior = self.priors[-1]
         sampling_kwargs = [
@@ -3165,7 +3164,7 @@ def _sample(
             sample_levels = range(len(self.priors))
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
-            self.priors[level].to(zs[0].device).eval()
+            self.priors[level].to(zs[level].device).eval()
             empty_cache()
             hps.sample_length = self.total_length  # generated length of the signal
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
@@ -3184,7 +3183,7 @@ def _sample(
                 x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
             self.vqvae.to("cpu")
 
-            if save_wav:
+            if save_results:
                 logdir = f"{self.start_time}/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
@@ -3199,30 +3198,30 @@ def _sample(
         return zs
 
     # Generate ancestral samples given a list of artists and genres
-    def ancestral_sample(self, labels, metas, n_samples=1, **sampling_kwargs):
+    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
-        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Continue ancestral sampling from previously saved codes
-    def continue_sample(self, zs, labels, metas, **sampling_kwargs):
+    def continue_sample(self, zs, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Upsample given already generated upper-level codes
-    def upsample(self, zs, labels, metas, **sampling_kwargs):
+    def upsample(self, zs, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors) - 1))
-        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
-    def primed_sample(self, x, labels, metas, **sampling_kwargs):
+    def primed_sample(self, x, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
         self.vqvae.to(x.device)
         with torch.no_grad():
             zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
         self.vqvae.to("cpu")
-        zs = self._sample(zs, labels, metas, sample_levels, **sampling_kwargs)
+        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
         return zs
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index a4a8eb72017aa..01ea229a0b3dd 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2449,6 +2449,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class JukeboxVQVAE(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index a5b64be68e353..e224912f28cd9 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,6 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
+    model_id = "ArthurZ/jukebox-1b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -73,51 +74,76 @@ class Jukebox1bModelTester(unittest.TestCase):
     ]
 
     EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
+
     EXPECTED_GPU_OUTPUTS = [
-        1150, 384, 222, 1612, 1063, 710, 984, 710, 1272, 405, 784, 2001,
-        1276, 778, 937, 256, 1368, 1053, 1421, 405, 710, 1425, 445, 1489,
-        1895, 947, 317, 1082, 947, 669, 1527, 1321, 1807, 756, 1150, 1150,
-        1489, 1139, 519, 475
+        1489, 1489, 324, 1489, 1600, 1150, 1489, 1489, 947, 1400, 1684, 1408,
+        1368, 758, 49, 1331, 1244, 798, 228, 1240, 1224, 1150, 1150, 1150,
+        519, 475, 1643, 653, 1369, 30, 1434, 1434, 1489, 1864, 1106, 1877,
+        1434, 231, 1621, 1063
+    ]
+    EXPECTED_VQVAE = [
+        -0.0168, -0.0083, -0.0062, -0.0078, -0.0095, -0.0108, -0.0117, -0.0124,
+        -0.0138, -0.0149, -0.0148, -0.0140, -0.0136, -0.0130, -0.0125, -0.0120,
+        -0.0129, -0.0148, -0.0151, -0.0138, -0.0130, -0.0129, -0.0125, -0.0116,
+        -0.0119, -0.0130, -0.0129, -0.0116, -0.0113, -0.0118, -0.0112, -0.0104,
+        -0.0114, -0.0127, -0.0122, -0.0103, -0.0083, -0.0070, -0.0060, -0.0051
+    ]
+    EXPECTED_PRIMED_0 = [
+        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
+        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
+        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
+        1907, 1788, 596, 1626
+    ]
+    EXPECTED_PRIMED_1 = [
+        1236, 1668, 1484, 1920, 1848, 1409, 139, 864, 1828, 1272, 1599, 824,
+        1672, 139, 555, 1484, 824, 1920, 555, 596, 1579, 1599, 1231, 1599,
+        1637, 1407, 212, 824, 1599, 116, 1433, 824, 258, 1599, 1433, 1895,
+        1063, 1433, 1433, 1599
+    ]
+    EXPECTED_PRIMED_2 = [
+        1684, 1873, 1119, 1189, 395, 611, 1901, 972, 890, 1337, 1392, 1927,
+        96, 972, 672, 780, 1119, 890, 158, 771, 1073, 1927, 353, 1331,
+        1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
+        197, 391, 302, 1930
     ]
     # fmt: on
-    def prepare_inputs(self, model_id):
-        tokenizer = JukeboxTokenizer.from_pretrained(model_id)
+
+    def prepare_inputs(self):
+        tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
         tokens = tokenizer(**self.metas)["input_ids"]
         return tokens
 
     @require_torch
     def test_sampling(self):
-        model_id = "ArthurZ/jukebox-1b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id, cond_res_scale=[None, True, False]).eval()
-
-        labels = self.prepare_inputs(model_id)
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=10).eval()
+        labels = self.prepare_inputs()
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False, sample_length_in_seconds=10)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [1], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [0], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     @require_torch
     def test_slow_sampling(self):
+        torch.backends.cuda.matmul.allow_tf32 = False
 
-        model_id = "ArthurZ/jukebox-1b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
+        model = JukeboxModel.from_pretrained(self.model_id).eval()
 
-        labels = [i.cuda() for i in self.prepare_inputs(model_id)]
+        labels = [i.cuda() for i in self.prepare_inputs()]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
 
@@ -130,12 +156,45 @@ def test_slow_sampling(self):
         self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
 
+    @slow
+    def test_primed_sampling(self):
+        torch.backends.cuda.matmul.allow_tf32 = False
+
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0.5).eval()
+        set_seed(0)
+        waveform = torch.rand((1, 5120, 1))
+        tokens = [i.cuda() for i in self.prepare_inputs()]
+
+        zs = [None, None, model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda()]
+        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length_in_seconds=1)
+        assert torch.allclose(zs[-1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
+
+        upper_2 = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
+        zs = [None, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), upper_2]
+        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length_in_seconds=1)
+        assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
+
+        upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
+        zs = [model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(), upper_1, upper_2]
+        zs = model._sample(zs, tokens, sample_levels=[0], save_results=False, sample_length_in_seconds=1)
+        assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
+
+    @slow
     def test_vqvae(self):
         # implemented vavae decoding test at 3 levels using the expected outputs
-        pass
+        zs = torch.tensor(self.EXPECTED_OUTPUT_2)
+        with torch.no_grad():
+            x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
+        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
+
+        zs.to("gpu")
+        self.vqvae.to("gpu")
+        with torch.no_grad():
+            x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
+        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
 
 
 @require_torch
@@ -200,20 +259,20 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [1], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [0], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[0][0, :80], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
@@ -224,7 +283,7 @@ def test_slow_sampling(self):
         labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_wav=False)
+        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
 
     def test_vqvae(self):
@@ -233,13 +292,7 @@ def test_vqvae(self):
         # implement vavae decoding test at 3 levels using the expected outputs
         pass
 
-    def test_primed_sampling(self):
-        pass
-
-    def test_upsampling(self):
-        pass
-
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_slow_sampling()
+    tester.test_primed_sampling()
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 47fee163137f1..5b7b9201f6513 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -46,6 +46,7 @@
 # Being in this list is an exception and should **not** be the rule.
 IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
     # models to ignore for not tested
+    "JukeboxVQVAE",  # Building part of bigger (tested) model.
     "OPTDecoder",  # Building part of bigger (tested) model.
     "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
     "SegformerDecodeHead",  # Building part of bigger (tested) model.
@@ -125,6 +126,7 @@
 # should **not** be the rule.
 IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
     # models to ignore for model xxx mapping
+    "JukeboxVQVAE",
     "DPTForDepthEstimation",
     "DecisionTransformerGPT2Model",
     "GLPNForDepthEstimation",

From a4ca9217099f2fd42a5dd39134eacac705845282 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 07:01:08 +0000
Subject: [PATCH 057/196] update indexes

---
 README.md                | 2 +-
 README_ko.md             | 2 +-
 README_zh-hans.md        | 2 +-
 README_zh-hant.md        | 2 +-
 docs/source/en/index.mdx | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index f274c5b71b091..c75a6f25d7ba6 100644
--- a/README.md
+++ b/README.md
@@ -304,7 +304,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_ko.md b/README_ko.md
index e5148e5dfd963..fee20cce37ad6 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -260,7 +260,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 3f8c8b0660501..168c0769be0ea 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -284,7 +284,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 62211a1449e0d..a31b6a8d26124 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -296,7 +296,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index e93474c1805c1..325710bb856e2 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -102,7 +102,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret
 1. **[Hubert](model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](model_doc/jukebox)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
+1. **[Jukebox](model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.

From 7ac95301c8db834107e6d9590cae5e89eaa23718 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 07:11:11 +0000
Subject: [PATCH 058/196] update and clean model doc

---
 docs/source/en/model_doc/jukebox.mdx | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index b6dc40f2fa392..268bbac44ef5f 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -1,15 +1,4 @@
----
-language: 
-  - "List of ISO 639-1 code for your language"
-  - en
-  - lang2
-thumbnail: "https://cdn.openai.com/research-covers/jukebox/2x-no-mark.jpg"
-tags:
-- MusicGeneration
-- transformers
----
-
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -20,7 +9,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License.
 -->
-
 # Jukebox
 
 ## Overview

From 47a220dae9906641b94ffba694d87930c284a4ef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 07:50:40 +0000
Subject: [PATCH 059/196] update tests

---
 src/transformers/models/jukebox/modeling_jukebox.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index fdbc4c13e2925..aa4e7112e39e7 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2908,7 +2908,7 @@ def save_wav(fname, lvl, metas, aud, sr):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
         if metas is not None:
-            artists, genres, lyrics = metas[i].values()  # twitter prompts or inputs are in the form of a dictionnary
+            artists, genres, lyrics = list(metas[i].values())  # twitter prompts or inputs are in the form of a dictionnary
             soundfile.write(
                 f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
                 aud[i],
@@ -3127,8 +3127,10 @@ def _sample(
         sample_tokens=None,
         offset=0,
         save_results=True,
+        sample_length=None
     ):
         top_prior = self.priors[-1]
+        total_length = sample_length if sample_length is not None else (int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)* top_prior.raw_to_tokens
         sampling_kwargs = [
             dict(
                 temp=0.99,
@@ -3136,8 +3138,7 @@ def _sample(
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
+                total_length=total_length,
             ),
             dict(
                 temp=0.99,
@@ -3145,8 +3146,7 @@ def _sample(
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
+                total_length=total_length,
             ),
             dict(
                 temp=sampling_temperature,
@@ -3154,8 +3154,7 @@ def _sample(
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=(int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)
-                * top_prior.raw_to_tokens,
+                total_length=total_length,
             ),
         ]
         hps = self.config

From fc176435ecad1859fb2398a15fe87318978a0e59 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 07:50:46 +0000
Subject: [PATCH 060/196] update test

---
 tests/models/jukebox/test_modeling_jukebox.py | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index e224912f28cd9..e8dd0b2659806 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -15,7 +15,7 @@
 import unittest
 
 from transformers import is_torch_available
-from transformers.testing_utils import require_torch, slow
+from transformers.testing_utils import require_accelerate, require_torch, slow
 from transformers.trainer_utils import set_seed
 
 
@@ -114,26 +114,27 @@ def prepare_inputs(self):
         return tokens
 
     @require_torch
+    @require_accelerate
     def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=10).eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration = 0, device_map="auto").eval()
         labels = self.prepare_inputs()
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False, sample_length_in_seconds=10)
+        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10, save_results=False)
+        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
@@ -254,26 +255,26 @@ def prepare_inputs(self, model_id):
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval()
+        model = JukeboxModel.from_pretrained(model_id,min_duration=0).eval()
 
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
+        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_tokens=10, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_tokens=10, save_results=False)
-        assert torch.allclose(zs[0][0, :80], torch.tensor(self.EXPECTED_OUTPUT_0))
+        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -295,4 +296,4 @@ def test_vqvae(self):
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_primed_sampling()
+    tester.test_sampling()

From 55f2b13f41a6e9f9dae1f6aa9e421e57512cd7fc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 09:39:22 +0000
Subject: [PATCH 061/196] fix slow sampling that is now faster + added all lvl
 logits

---
 tests/models/jukebox/test_modeling_jukebox.py | 83 ++++++++++++-------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index e8dd0b2659806..1a1ea3a806e22 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -50,36 +50,45 @@ class Jukebox1bModelTester(unittest.TestCase):
     )
     # fmt: off
     EXPECTED_OUTPUT_2 = [
-        1864, 1536, 1213, 1869, 1321, 1597, 519, 947, 1177, 789, 1434, 653,
-        653, 653, 653, 653, 653, 653, 653, 653, 1007, 1472, 255, 1228,
-        555, 1272, 1379, 1423, 1673, 427, 1683, 1321, 475, 416, 1177, 1827,
-        1106, 1127, 1494, 812
+        1434, 324, 1489, 756, 1224, 1150, 1489, 353, 2033, 1622, 1536, 519,
+        475, 1996, 1643, 701, 1229, 1434, 10, 1420, 1306, 178, 409, 2038,
+        1355, 286, 897, 1804, 253, 1643, 1685, 1769, 1002, 1597, 30, 539,
+        376, 427, 1179, 286
     ]
 
     EXPECTED_OUTPUT_1 = [
-        1125, 1585, 1485, 2020, 1141, 1680, 381, 539, 1368, 642, 1585, 284,
-        717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
-        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
-        1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
-        2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
-        1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
-        591, 1803, 58, 591, 529, 1079, 642, 591
+        1125, 1125, 904, 2037, 1125, 1274, 317, 642, 1274, 317, 851, 642,
+        642, 747, 867, 502, 502, 416, 1125, 1125, 317, 1274, 317, 1125,
+        416, 1125, 1125, 1125, 1125, 317, 855, 844, 94, 855, 502, 1714,
+        1107, 747, 1353, 573
     ]
 
     EXPECTED_OUTPUT_0 = [
-        1979, 1613, 290, 1843, 844, 1427, 293, 616, 1771, 632, 591, 290,
-        234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
-        290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
-        854, 1120, 1572, 719
+        1755, 1061, 808, 1755, 992, 1572, 185, 491, 992, 417, 234, 89,
+        234, 417, 234, 234, 234, 417, 1638, 1638, 677, 1659, 541, 1659,
+        946, 579, 992, 556, 844, 329, 926, 556, 293, 579, 946, 1659,
+        1562, 579, 1372, 290
     ]
 
     EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
 
-    EXPECTED_GPU_OUTPUTS = [
-        1489, 1489, 324, 1489, 1600, 1150, 1489, 1489, 947, 1400, 1684, 1408,
-        1368, 758, 49, 1331, 1244, 798, 228, 1240, 1224, 1150, 1150, 1150,
-        519, 475, 1643, 653, 1369, 30, 1434, 1434, 1489, 1864, 1106, 1877,
-        1434, 231, 1621, 1063
+    EXPECTED_GPU_OUTPUTS_0 = [
+        591, 1979, 89, 1332, 1572, 755, 844, 1022, 234, 1174, 1962, 1174,
+        1755, 676, 58, 1756, 844, 739, 185, 1332, 806, 1180, 774, 842,
+        306, 442, 1797, 734, 1081, 109, 806, 1492, 926, 2008, 844, 2008,
+        992, 89, 1353, 637
+    ]
+    EXPECTED_GPU_OUTPUTS_1 = [
+        1125, 2037, 317, 1372, 2037, 851, 1274, 1125, 642, 502, 1274, 851,
+        1125, 502, 317, 1125, 880, 904, 317, 1125, 642, 502, 844, 851,
+        416, 317, 1585, 642, 1125, 58, 697, 1125, 1585, 2037, 502, 2037,
+        851, 317, 1125, 642
+    ]
+    EXPECTED_GPU_OUTPUTS_2 = [
+        1489, 1489, 324, 1489, 1600, 1150, 1489, 1489, 947, 1357, 1600, 1417,
+        1481, 1003, 141, 1165, 1303, 904, 303, 1369, 395, 461, 994, 1283,
+        269, 35, 1699, 241, 1369, 35, 1303, 583, 825, 1941, 1089, 1944,
+        581, 35, 1153, 1153
     ]
     EXPECTED_VQVAE = [
         -0.0168, -0.0083, -0.0062, -0.0078, -0.0095, -0.0108, -0.0117, -0.0124,
@@ -114,9 +123,8 @@ def prepare_inputs(self):
         return tokens
 
     @require_torch
-    @require_accelerate
     def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration = 0, device_map="auto").eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration = 0).eval()
         labels = self.prepare_inputs()
 
         set_seed(0)
@@ -128,21 +136,21 @@ def test_sampling(self):
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
+        assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
+        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     @require_torch
     def test_slow_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
 
-        model = JukeboxModel.from_pretrained(self.model_id).eval()
+        model = JukeboxModel.from_pretrained(self.model_id,min_duration=0).eval()
 
         labels = [i.cuda() for i in self.prepare_inputs()]
         set_seed(0)
@@ -157,8 +165,22 @@ def test_slow_sampling(self):
         self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         set_seed(0)
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
+        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+
+        zs[-1] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_2).unsqueeze(0)
+        set_seed(0)
+        zs[-1] = torch.cat((zs[-1].cuda(), torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
+        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+
+        zs[-2] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_1).unsqueeze(0)
+
+        set_seed(0)
+        zs[-2] = torch.cat((zs[-2].cuda(), torch.zeros(1, 1000000 - zs[-2].shape[-1]).cuda()), dim=-1).long()
+        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
     def test_primed_sampling(self):
@@ -185,17 +207,16 @@ def test_primed_sampling(self):
 
     @slow
     def test_vqvae(self):
-        # implemented vavae decoding test at 3 levels using the expected outputs
         zs = torch.tensor(self.EXPECTED_OUTPUT_2)
         with torch.no_grad():
             x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
-        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
+        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
         zs.to("gpu")
         self.vqvae.to("gpu")
         with torch.no_grad():
             x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
-        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS))
+        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
 
 @require_torch
@@ -296,4 +317,4 @@ def test_vqvae(self):
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_sampling()
+    tester.test_slow_sampling()

From f121fceb4210ebe9e2d39e0d958b6d4271c70423 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 10:01:09 +0000
Subject: [PATCH 062/196] 1b lyrics testing is full and finished

---
 tests/models/jukebox/test_modeling_jukebox.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 1a1ea3a806e22..976c02517e02b 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -186,23 +186,23 @@ def test_slow_sampling(self):
     def test_primed_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
 
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0.5).eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
         set_seed(0)
         waveform = torch.rand((1, 5120, 1))
         tokens = [i.cuda() for i in self.prepare_inputs()]
 
         zs = [None, None, model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda()]
-        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length_in_seconds=1)
+        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length=40*model.priors[-1].raw_to_tokens)
         assert torch.allclose(zs[-1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
 
         upper_2 = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
         zs = [None, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), upper_2]
-        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length_in_seconds=1)
+        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length=40*model.priors[-2].raw_to_tokens)
         assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
 
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
         zs = [model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(), upper_1, upper_2]
-        zs = model._sample(zs, tokens, sample_levels=[0], save_results=False, sample_length_in_seconds=1)
+        zs = model._sample(zs, tokens, sample_levels=[0], save_results=False, sample_length=40*model.priors[-3].raw_to_tokens)
         assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow
@@ -317,4 +317,4 @@ def test_vqvae(self):
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_slow_sampling()
+    tester.test_primed_sampling()

From 7777fc6fb18c5462ce6d48d8d709a8c41c0df075 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 10:01:41 +0000
Subject: [PATCH 063/196] style

---
 .../models/jukebox/modeling_jukebox.py        |  12 +-
 .../models/jukebox/sample_original_jukebox.py | 205 ++++++++++++++++++
 2 files changed, 214 insertions(+), 3 deletions(-)
 create mode 100644 src/transformers/models/jukebox/sample_original_jukebox.py

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index aa4e7112e39e7..a9fb080fd404a 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2908,7 +2908,9 @@ def save_wav(fname, lvl, metas, aud, sr):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
         if metas is not None:
-            artists, genres, lyrics = list(metas[i].values())  # twitter prompts or inputs are in the form of a dictionnary
+            artists, genres, lyrics = list(
+                metas[i].values()
+            )  # twitter prompts or inputs are in the form of a dictionnary
             soundfile.write(
                 f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
                 aud[i],
@@ -3127,10 +3129,14 @@ def _sample(
         sample_tokens=None,
         offset=0,
         save_results=True,
-        sample_length=None
+        sample_length=None,
     ):
         top_prior = self.priors[-1]
-        total_length = sample_length if sample_length is not None else (int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens)* top_prior.raw_to_tokens
+        total_length = (
+            sample_length
+            if sample_length is not None
+            else (int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens) * top_prior.raw_to_tokens
+        )
         sampling_kwargs = [
             dict(
                 temp=0.99,
diff --git a/src/transformers/models/jukebox/sample_original_jukebox.py b/src/transformers/models/jukebox/sample_original_jukebox.py
new file mode 100644
index 0000000000000..549897e3fc79a
--- /dev/null
+++ b/src/transformers/models/jukebox/sample_original_jukebox.py
@@ -0,0 +1,205 @@
+# in order to be used, the following git repo has to be used :
+# git clone --branch adaptive_device https://github.com/ArthurZucker/jukebox.git
+import os
+
+import torch as t
+
+from jukebox.hparams import HPARAMS_REGISTRY, Hyperparams, setup_hparams
+from jukebox.make_models import MODELS, make_prior, make_vqvae
+from jukebox.sample import _sample
+from jukebox.utils.dist_utils import setup_dist_from_mpi
+from jukebox.utils.torch_utils import empty_cache
+
+
+rank, local_rank, device = setup_dist_from_mpi()
+import random
+
+import numpy as np
+import torch
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.enabled = False
+
+
+def log_zs(zs, level, model, save_dir="logits"):
+    os.makedirs(save_dir, exist_ok=True)
+    with open(f"{save_dir}/{model}_{level}.txt", "w") as file:
+        file.write(str(zs[level][0].cpu()))
+
+
+def get_args(model):
+    sampling_temperature = 0.98
+    lower_batch_size = 16
+    max_batch_size = 1 if model == "5b_lyrics" else 16
+    lower_level_chunk_size = 32
+    chunk_size = 16 if model == "5b_lyrics" else 32
+    sampling_kwargs = [
+        dict(
+            temp=0.99,
+            fp16=False,
+            max_batch_size=lower_batch_size,
+            chunk_size=lower_level_chunk_size,
+            sample_tokens=10,
+        ),
+        dict(
+            temp=0.99,
+            fp16=False,
+            max_batch_size=lower_batch_size,
+            chunk_size=lower_level_chunk_size,
+            sample_tokens=10,
+        ),
+        dict(
+            temp=sampling_temperature,
+            fp16=False,
+            max_batch_size=max_batch_size,
+            chunk_size=chunk_size,
+            sample_tokens=10,
+        ),
+    ]
+    return sampling_kwargs
+
+
+def set_seed(seed):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def test_sampling(model, device, tokens=40):
+    hps = Hyperparams()
+    hps.device = device
+    hps.sr = 44100
+    hps.n_samples = 1
+    hps.name = "samples"
+    hps.levels = 3
+    hps.hop_fraction = [0.5, 0.5, 0.125]
+    HPARAMS_REGISTRY[f"prior_{model}"][
+        "min_duration"
+    ] = 0  # set the minium duration of the model to 0 to generate only 40 tokens
+    vqvae, *priors = MODELS[model]
+    vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=264576)), device)  # before : 1048576, 2645888
+    top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)
+    hps.sample_length = tokens * top_prior.raw_to_tokens
+    metas = (
+        [
+            dict(
+                artist="Zac Brown Band",
+                genre="Country",
+                total_length=hps.sample_length,
+                offset=0,
+                lyrics="""I met a traveller from an antique land,
+    Who said "Two vast and trunkless legs of stone Stand in the desert. . . . Near them, on the sand, Half sunk a
+    shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those
+    passions read Which yet survive, stamped on these lifeless things, The hand that mocked them, and the heart that
+    fed; And on the pedestal, these words appear: My name is Ozymandias, King of Kings; Look on my Works, ye Mighty,
+    and despair! Nothing beside remains. Round the decay Of that colossal Wreck, boundless and bare The lone and level
+    sands stretch far away
+    """,
+            ),
+        ]
+        * hps.n_samples
+    )
+
+    labels = [None, None, top_prior.labeller.get_batch_labels(metas, device)]
+    sampling_kwargs = get_args(model)
+    hps.sample_length = tokens * top_prior.raw_to_tokens
+
+    set_seed(0)
+    zs = [t.zeros(hps.n_samples, 0, dtype=t.long, device=device) for _ in range(len(priors))]
+    zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)
+    log_zs(zs, 2, f"{model}-{device}")
+
+    del top_prior
+    empty_cache()
+    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, device) for prior in priors[:-1]]
+    labels[:2] = [prior.labeller.get_batch_labels(metas, device) for prior in upsamplers]
+
+    set_seed(0)
+    zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).to(device)), dim=-1).long()
+    hps.sample_length = tokens * upsamplers[1].raw_to_tokens
+    zs = _sample(zs, labels, sampling_kwargs, [None, upsamplers[1], None], [1], hps)
+    log_zs(zs, 1, f"{model}-{device}")
+
+    set_seed(0)
+    hps.sample_length = tokens * upsamplers[0].raw_to_tokens
+    zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).to(device)), dim=-1).long()
+    zs = _sample(zs, labels, sampling_kwargs, [upsamplers[0], None, None], [0], hps)
+    log_zs(zs, 0, f"{model}-{device}")
+
+    empty_cache()
+    del upsamplers
+    upsamplers = None
+
+
+def test_prime_samling(model, device):
+    hps = Hyperparams()
+    hps.device = device
+    hps.sr = 44100
+    hps.n_samples = 1
+    hps.name = "samples"
+    hps.levels = 3
+    hps.hop_fraction = [0.5, 0.5, 0.125]
+    HPARAMS_REGISTRY[f"prior_{model}"]["min_duration"] = 0
+    vqvae, *priors = MODELS[model]
+    vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=264576)), device)  # before : 1048576, 2645888
+    top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)
+    hps.sample_length = tokens * top_prior.raw_to_tokens
+    metas = (
+        [
+            dict(
+                artist="Zac Brown Band",
+                genre="Country",
+                total_length=hps.sample_length,
+                offset=0,
+                lyrics="""I met a traveller from an antique land,
+    Who said "Two vast and trunkless legs of stone Stand in the desert. . . . Near them, on the sand, Half sunk a
+    shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those
+    passions read Which yet survive, stamped on these lifeless things, The hand that mocked them, and the heart that
+    fed; And on the pedestal, these words appear: My name is Ozymandias, King of Kings; Look on my Works, ye Mighty,
+    and despair! Nothing beside remains. Round the decay Of that colossal Wreck, boundless and bare The lone and level
+    sands stretch far away
+    """,
+            ),
+        ]
+        * hps.n_samples
+    )
+    labels = [None, None, top_prior.labeller.get_batch_labels(metas, device)]
+    sampling_kwargs = get_args(model)
+
+    x = torch.rand((1, 5120, 1)).cuda()
+    vqvae.to("cuda")
+    zs = [None, None, top_prior.encode(x, start_level=2, bs_chunks=x.shape[0])[0].cuda()]
+    zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)
+
+    if True:
+        del top_prior
+        empty_cache()
+        top_prior = None
+    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, "cuda") for prior in priors[:-1]]
+    labels = [
+        upsamplers[0].labeller.get_batch_labels(metas, "cuda"),
+        upsamplers[0].labeller.get_batch_labels(metas, "cuda"),
+        None,
+    ]
+
+    zs = [
+        None,
+        upsamplers[-1].encode(x, start_level=1, bs_chunks=x.shape[0])[0].cuda(),
+        torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long(),
+    ]
+    zs = _sample(zs, labels, sampling_kwargs, [None, upsamplers[1], None], [1], hps)
+
+    zs = [
+        upsamplers[-1].encode(x, start_level=0, bs_chunks=x.shape[0])[0].cuda(),
+        torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[1]).cuda()), dim=-1).long(),
+        torch.zeros(1, 1000000).cuda().long(),
+    ]
+    zs = _sample(zs, labels, sampling_kwargs, [upsamplers[0], None, None], [0], hps)
+
+
+# test_sampling("1b_lyrics","cpu")
+test_sampling("1b_lyrics", "cuda")
+test_sampling("5b_lyrics", "cpu", tokens=60)
+test_sampling("5b_lyrics", "cuda", tokens=60)

From 69d90811555fb64bc9ab2a96a6c6957d6b0f6b94 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 10:05:19 +0000
Subject: [PATCH 064/196] update original sampling tests

---
 .../models/jukebox/sample_original_jukebox.py | 46 ++++++++++++-------
 tests/models/jukebox/test_modeling_jukebox.py | 36 +++++++++------
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/jukebox/sample_original_jukebox.py b/src/transformers/models/jukebox/sample_original_jukebox.py
index 549897e3fc79a..2ec4bd796529e 100644
--- a/src/transformers/models/jukebox/sample_original_jukebox.py
+++ b/src/transformers/models/jukebox/sample_original_jukebox.py
@@ -133,7 +133,7 @@ def test_sampling(model, device, tokens=40):
     upsamplers = None
 
 
-def test_prime_samling(model, device):
+def test_prime_samling(model, device, tokens=40):
     hps = Hyperparams()
     hps.device = device
     hps.sr = 44100
@@ -168,38 +168,50 @@ def test_prime_samling(model, device):
     labels = [None, None, top_prior.labeller.get_batch_labels(metas, device)]
     sampling_kwargs = get_args(model)
 
-    x = torch.rand((1, 5120, 1)).cuda()
-    vqvae.to("cuda")
-    zs = [None, None, top_prior.encode(x, start_level=2, bs_chunks=x.shape[0])[0].cuda()]
+    set_seed(0)
+    x = torch.rand((1, 5120, 1)).to(device)
+    vqvae.to(device)
+    zs = [None, None, top_prior.encode(x, start_level=2, bs_chunks=x.shape[0])[0].to(device)]
     zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)
+    log_zs(zs, 2, f"primed-{model}-{device}")
 
-    if True:
-        del top_prior
-        empty_cache()
-        top_prior = None
-    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, "cuda") for prior in priors[:-1]]
+    del top_prior
+    empty_cache()
+
+    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, device) for prior in priors[:-1]]
     labels = [
-        upsamplers[0].labeller.get_batch_labels(metas, "cuda"),
-        upsamplers[0].labeller.get_batch_labels(metas, "cuda"),
+        upsamplers[0].labeller.get_batch_labels(metas, device),
+        upsamplers[0].labeller.get_batch_labels(metas, device),
         None,
     ]
 
+    set_seed(0)
+    hps.sample_length = tokens * upsamplers[1].raw_to_tokens
     zs = [
         None,
-        upsamplers[-1].encode(x, start_level=1, bs_chunks=x.shape[0])[0].cuda(),
-        torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long(),
+        upsamplers[-1].encode(x, start_level=1, bs_chunks=x.shape[0])[0].to(device),
+        torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).to(device)), dim=-1).long(),
     ]
     zs = _sample(zs, labels, sampling_kwargs, [None, upsamplers[1], None], [1], hps)
+    log_zs(zs, 1, f"primed-{model}-{device}")
 
+    set_seed(0)
+    hps.sample_length = tokens * upsamplers[0].raw_to_tokens
     zs = [
-        upsamplers[-1].encode(x, start_level=0, bs_chunks=x.shape[0])[0].cuda(),
-        torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[1]).cuda()), dim=-1).long(),
-        torch.zeros(1, 1000000).cuda().long(),
+        upsamplers[-1].encode(x, start_level=0, bs_chunks=x.shape[0])[0].to(device),
+        torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[1]).to(device)), dim=-1).long(),
+        torch.zeros(1, 1000000).to(device).long(),
     ]
     zs = _sample(zs, labels, sampling_kwargs, [upsamplers[0], None, None], [0], hps)
+    log_zs(zs, 0, f"primed-{model}-{device}")
 
 
-# test_sampling("1b_lyrics","cpu")
+test_sampling("1b_lyrics", "cpu")
 test_sampling("1b_lyrics", "cuda")
 test_sampling("5b_lyrics", "cpu", tokens=60)
 test_sampling("5b_lyrics", "cuda", tokens=60)
+
+test_prime_samling("1b_lyrics", "cpu")
+test_prime_samling("1b_lyrics", "cuda")
+test_prime_samling("5b_lyrics", "cpu", tokens=60)
+test_prime_samling("5b_lyrics", "cuda", tokens=60)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 976c02517e02b..6f5c52db4bcbb 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -124,25 +124,25 @@ def prepare_inputs(self):
 
     @require_torch
     def test_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration = 0).eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
         labels = self.prepare_inputs()
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
@@ -150,7 +150,7 @@ def test_sampling(self):
     def test_slow_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
 
-        model = JukeboxModel.from_pretrained(self.model_id,min_duration=0).eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
 
         labels = [i.cuda() for i in self.prepare_inputs()]
         set_seed(0)
@@ -166,20 +166,20 @@ def test_slow_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1].cuda(), torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2].cuda(), torch.zeros(1, 1000000 - zs[-2].shape[-1]).cuda()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
@@ -192,17 +192,23 @@ def test_primed_sampling(self):
         tokens = [i.cuda() for i in self.prepare_inputs()]
 
         zs = [None, None, model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda()]
-        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length=40*model.priors[-1].raw_to_tokens)
+        zs = model._sample(
+            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[-1].raw_to_tokens
+        )
         assert torch.allclose(zs[-1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
 
         upper_2 = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
         zs = [None, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), upper_2]
-        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length=40*model.priors[-2].raw_to_tokens)
+        zs = model._sample(
+            zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
+        )
         assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
 
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
         zs = [model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(), upper_1, upper_2]
-        zs = model._sample(zs, tokens, sample_levels=[0], save_results=False, sample_length=40*model.priors[-3].raw_to_tokens)
+        zs = model._sample(
+            zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[-3].raw_to_tokens
+        )
         assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow
@@ -276,25 +282,25 @@ def prepare_inputs(self, model_id):
 
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id,min_duration=0).eval()
+        model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval()
 
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40*model.priors[-1].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_length=40*model.priors[-2].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_length=40*model.priors[-3].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow

From dc42a28c51fe0794fc09f40c8cd791c97d977e97 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 17:17:56 +0000
Subject: [PATCH 065/196] finish tests

---
 tests/models/jukebox/test_modeling_jukebox.py | 154 +++++++++++-------
 1 file changed, 95 insertions(+), 59 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 6f5c52db4bcbb..c7b42d746beff 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -50,24 +50,24 @@ class Jukebox1bModelTester(unittest.TestCase):
     )
     # fmt: off
     EXPECTED_OUTPUT_2 = [
-        1434, 324, 1489, 756, 1224, 1150, 1489, 353, 2033, 1622, 1536, 519,
-        475, 1996, 1643, 701, 1229, 1434, 10, 1420, 1306, 178, 409, 2038,
-        1355, 286, 897, 1804, 253, 1643, 1685, 1769, 1002, 1597, 30, 539,
-        376, 427, 1179, 286
+        1864, 1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534,
+        1000, 1445, 1105, 1130, 967, 515, 1434, 1620, 534, 1495, 283, 1445,
+        333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883,
+        1405, 1276, 1455, 1228
     ]
 
     EXPECTED_OUTPUT_1 = [
-        1125, 1125, 904, 2037, 1125, 1274, 317, 642, 1274, 317, 851, 642,
-        642, 747, 867, 502, 502, 416, 1125, 1125, 317, 1274, 317, 1125,
-        416, 1125, 1125, 1125, 1125, 317, 855, 844, 94, 855, 502, 1714,
-        1107, 747, 1353, 573
+        1125, 1751, 697, 1776, 1141, 1476, 391, 697, 1125, 684, 867, 416,
+        844, 1372, 1274, 717, 1274, 844, 1299, 1419, 697, 1370, 317, 1125,
+        191, 1440, 1370, 1440, 1370, 282, 1621, 1370, 368, 349, 867, 1872,
+        1262, 869, 1728, 747
     ]
 
     EXPECTED_OUTPUT_0 = [
-        1755, 1061, 808, 1755, 992, 1572, 185, 491, 992, 417, 234, 89,
-        234, 417, 234, 234, 234, 417, 1638, 1638, 677, 1659, 541, 1659,
-        946, 579, 992, 556, 844, 329, 926, 556, 293, 579, 946, 1659,
-        1562, 579, 1372, 290
+        1755, 842, 307, 1843, 1022, 1395, 234, 1554, 806, 739, 1022, 442,
+        616, 556, 268, 1499, 933, 457, 1440, 1837, 755, 985, 308, 902,
+        293, 1443, 1671, 1141, 1533, 555, 1562, 1061, 287, 417, 1022, 2008,
+        1186, 1015, 1777, 268
     ]
 
     EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
@@ -90,13 +90,6 @@ class Jukebox1bModelTester(unittest.TestCase):
         269, 35, 1699, 241, 1369, 35, 1303, 583, 825, 1941, 1089, 1944,
         581, 35, 1153, 1153
     ]
-    EXPECTED_VQVAE = [
-        -0.0168, -0.0083, -0.0062, -0.0078, -0.0095, -0.0108, -0.0117, -0.0124,
-        -0.0138, -0.0149, -0.0148, -0.0140, -0.0136, -0.0130, -0.0125, -0.0120,
-        -0.0129, -0.0148, -0.0151, -0.0138, -0.0130, -0.0129, -0.0125, -0.0116,
-        -0.0119, -0.0130, -0.0129, -0.0116, -0.0113, -0.0118, -0.0112, -0.0104,
-        -0.0114, -0.0127, -0.0122, -0.0103, -0.0083, -0.0070, -0.0060, -0.0051
-    ]
     EXPECTED_PRIMED_0 = [
         390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
         1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
@@ -115,6 +108,19 @@ class Jukebox1bModelTester(unittest.TestCase):
         1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
         197, 391, 302, 1930
     ]
+    EXPECTED_VQVAE_ENCODE= [
+        390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
+        1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
+        1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
+        1907, 1788, 596, 1626
+    ]
+    EXPECTED_VQVAE_DECODE= [
+        -0.0492, -0.0524, -0.0565, -0.0640, -0.0686, -0.0684, -0.0677, -0.0664,
+        -0.0605, -0.0490, -0.0330, -0.0168, -0.0083, -0.0075, -0.0051, 0.0025,
+        0.0136, 0.0261, 0.0386, 0.0497, 0.0580, 0.0599, 0.0583, 0.0614,
+        0.0740, 0.0889, 0.1023, 0.1162, 0.1211, 0.1212, 0.1251, 0.1336,
+        0.1502, 0.1686, 0.1883, 0.2148, 0.2363, 0.2458, 0.2507, 0.2531
+    ]
     # fmt: on
 
     def prepare_inputs(self):
@@ -122,7 +128,7 @@ def prepare_inputs(self):
         tokens = tokenizer(**self.metas)["input_ids"]
         return tokens
 
-    @require_torch
+    @slow
     def test_sampling(self):
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
         labels = self.prepare_inputs()
@@ -146,7 +152,6 @@ def test_sampling(self):
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
-    @require_torch
     def test_slow_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
 
@@ -213,16 +218,16 @@ def test_primed_sampling(self):
 
     @slow
     def test_vqvae(self):
-        zs = torch.tensor(self.EXPECTED_OUTPUT_2)
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
+        set_seed(0)
+        x = torch.rand((1,5120,1))
         with torch.no_grad():
-            x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
-        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+            zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
+        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
 
-        zs.to("gpu")
-        self.vqvae.to("gpu")
         with torch.no_grad():
-            x = self.vqvae.decode(zs, start_level=2, bs_chunks=zs.shape[0])
-        assert torch.allclose(x.cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+            x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
+        assert torch.allclose(x[0,:40,0], torch.tensor(self.EXPECTED_VQVAE_DECODE),atol=1e-4)
 
 
 @require_torch
@@ -250,29 +255,51 @@ class Jukebox5bModelTester(unittest.TestCase):
 
     # fmt: off
     EXPECTED_OUTPUT_2 = [
-        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 1489, 653,
-        653, 653, 653, 653, 653, 653, 653, 653
+        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509,  6, 1272
     ]
 
     EXPECTED_OUTPUT_1 = [
-        1125, 416, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 1585, 284,
-        717, 1544, 1045, 1320, 711, 193, 1440, 1193, 416, 1125, 539, 1544,
-        593, 1274, 1181, 1658, 1181, 1145, 2037, 1125, 556, 1014, 1045, 1858,
-        1749, 1803, 1440, 1145, 416, 416, 1372, 1079, 1045, 1320, 1764, 158,
-        2020, 1543, 2037, 416, 539, 2047, 1446, 885, 1749, 2047, 118, 1348,
-        1585, 284, 529, 2047, 1228, 556, 732, 2047, 307, 1323, 2037, 1446,
-        591, 1803, 58, 591, 529, 1079, 642, 591
+        1125, 416, 1125, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 416,
+         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
     ]
 
     EXPECTED_OUTPUT_0 = [
-        1755, 1061, 234, 1755, 290, 1572, 234, 491, 992, 417, 591, 290,
-        234, 842, 589, 948, 983, 616, 1613, 1613, 290, 632, 89, 632,
-        290, 1022, 983, 1612, 1353, 581, 1353, 755, 185, 307, 632, 1979,
-        854, 1120, 1572, 719, 491, 34, 755, 632, 844, 755, 1802, 225,
-        2013, 1814, 1148, 616, 185, 1979, 1460, 983, 1168, 1613, 34, 1242,
-        632, 34, 34, 1982, 1510, 554, 983, 1784, 526, 1691, 1268, 1268,
-        290, 755, 34, 307, 222, 234, 648, 526
+       1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
+        616, 616, 616, 616, 307, 290, 417, 1755, 234, 1755, 185, 290,
+        290, 290, 307, 616, 616, 616, 616, 616, 290, 234, 234, 1755,
+        234, 234, 1755, 234, 185, 185, 307, 616, 616, 616, 616, 290,
+        1755, 1755, 1755, 234, 234, 1755, 1572, 290, 307, 616, 34, 616
     ]
+
+    EXPECTED_GPU_OUTPUTS_2 = [
+        1489, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
+        653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653
+    ]
+    EXPECTED_GPU_OUTPUTS_1 = [
+        1125, 1125, 416, 1125, 1125, 416, 1125, 1125, 416, 416, 1125, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
+    ]
+    EXPECTED_GPU_OUTPUTS_0 = [
+        491, 1755, 34, 1613, 1755, 417, 992, 1613, 222, 842, 1353, 1613,
+        844, 632, 185, 1613, 844, 632, 185, 1613, 185, 842, 677, 1613,
+        185, 114, 1353, 1613, 307, 89, 844, 1613, 307, 1332, 234, 1979,
+        307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
+        307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
+    ]
+    
     # fmt: on
 
     def prepare_inputs(self, model_id):
@@ -280,6 +307,7 @@ def prepare_inputs(self, model_id):
         tokens = tokenizer(**self.metas)["input_ids"]
         return tokens
 
+    @slow
     def test_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval()
@@ -287,40 +315,48 @@ def test_sampling(self):
         labels = self.prepare_inputs(model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
         zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0, :80], torch.tensor(self.EXPECTED_OUTPUT_1))
+        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
 
         set_seed(0)
         zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0, :40], torch.tensor(self.EXPECTED_OUTPUT_0))
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
-    @slow
+    # @slow
     def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id).eval().to("cuda")
+        model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval().to("cuda")
 
         labels = [i.cuda() for i in self.prepare_inputs(model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_tokens=10, save_results=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_OUTPUT_2))
+        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+
+        zs[-1] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_2).unsqueeze(0)
+        set_seed(0)
+        zs[-1] = torch.cat((zs[-1].cuda(), torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
+        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+
+        zs[-2] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_1).unsqueeze(0)
+
+        set_seed(0)
+        zs[-2] = torch.cat((zs[-2].cuda(), torch.zeros(1, 1000000 - zs[-2].shape[-1]).cuda()), dim=-1).long()
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
-    def test_vqvae(self):
-        # test encoding of an audio
-        # test decoding
-        # implement vavae decoding test at 3 levels using the expected outputs
-        pass
 
 
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
-    tester.test_primed_sampling()
+    tester.test_vqvae()

From 187dbd2118179ed78c28b0f450f2913e39b72b33 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 27 Jul 2022 17:18:42 +0000
Subject: [PATCH 066/196] style

---
 tests/models/jukebox/test_modeling_jukebox.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index c7b42d746beff..fa80de63ab7d1 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -220,14 +220,14 @@ def test_primed_sampling(self):
     def test_vqvae(self):
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
         set_seed(0)
-        x = torch.rand((1,5120,1))
+        x = torch.rand((1, 5120, 1))
         with torch.no_grad():
             zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
 
         with torch.no_grad():
             x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
-        assert torch.allclose(x[0,:40,0], torch.tensor(self.EXPECTED_VQVAE_DECODE),atol=1e-4)
+        assert torch.allclose(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4)
 
 
 @require_torch
@@ -299,7 +299,7 @@ class Jukebox5bModelTester(unittest.TestCase):
         307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
         307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
     ]
-    
+
     # fmt: on
 
     def prepare_inputs(self, model_id):
@@ -356,7 +356,6 @@ def test_slow_sampling(self):
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
 
-
 if __name__ == "__main__":
     tester = Jukebox1bModelTester()
     tester.test_vqvae()

From 322c00dedaa2ec6cd93fca6589d60826b8193d40 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 28 Jul 2022 07:33:57 +0000
Subject: [PATCH 067/196] quality + slow test

---
 .../models/jukebox/sample_original_jukebox.py |  9 +++------
 tests/models/jukebox/test_modeling_jukebox.py | 20 +++++++++----------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/jukebox/sample_original_jukebox.py b/src/transformers/models/jukebox/sample_original_jukebox.py
index 2ec4bd796529e..86ebae3206363 100644
--- a/src/transformers/models/jukebox/sample_original_jukebox.py
+++ b/src/transformers/models/jukebox/sample_original_jukebox.py
@@ -1,7 +1,10 @@
 # in order to be used, the following git repo has to be used :
 # git clone --branch adaptive_device https://github.com/ArthurZucker/jukebox.git
 import os
+import random
 
+import numpy as np
+import torch
 import torch as t
 
 from jukebox.hparams import HPARAMS_REGISTRY, Hyperparams, setup_hparams
@@ -12,11 +15,6 @@
 
 
 rank, local_rank, device = setup_dist_from_mpi()
-import random
-
-import numpy as np
-import torch
-
 
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cudnn.enabled = False
@@ -130,7 +128,6 @@ def test_sampling(model, device, tokens=40):
 
     empty_cache()
     del upsamplers
-    upsamplers = None
 
 
 def test_prime_samling(model, device, tokens=40):
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index fa80de63ab7d1..b8317860c401a 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -15,7 +15,7 @@
 import unittest
 
 from transformers import is_torch_available
-from transformers.testing_utils import require_accelerate, require_torch, slow
+from transformers.testing_utils import require_torch, slow
 from transformers.trainer_utils import set_seed
 
 
@@ -108,13 +108,13 @@ class Jukebox1bModelTester(unittest.TestCase):
         1269, 1459, 1333, 1645, 812, 1577, 1337, 606, 353, 981, 1466, 619,
         197, 391, 302, 1930
     ]
-    EXPECTED_VQVAE_ENCODE= [
+    EXPECTED_VQVAE_ENCODE = [
         390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
         1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
         1516, 596, 1002, 1002, 1002, 1907, 1788, 1788, 1788, 1854, 1788, 1907,
         1907, 1788, 596, 1626
     ]
-    EXPECTED_VQVAE_DECODE= [
+    EXPECTED_VQVAE_DECODE = [
         -0.0492, -0.0524, -0.0565, -0.0640, -0.0686, -0.0684, -0.0677, -0.0664,
         -0.0605, -0.0490, -0.0330, -0.0168, -0.0083, -0.0075, -0.0051, 0.0025,
         0.0136, 0.0261, 0.0386, 0.0497, 0.0580, 0.0599, 0.0583, 0.0614,
@@ -259,19 +259,19 @@ class Jukebox5bModelTester(unittest.TestCase):
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509,  6, 1272
+        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
     ]
 
     EXPECTED_OUTPUT_1 = [
         1125, 416, 1125, 1125, 1125, 1125, 1125, 416, 416, 416, 416, 416,
-         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
-         416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416,
+        416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416, 416
     ]
 
     EXPECTED_OUTPUT_0 = [
-       1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
+        1755, 1061, 234, 1755, 1061, 1755, 185, 290, 307, 307, 616, 616,
         616, 616, 616, 616, 307, 290, 417, 1755, 234, 1755, 185, 290,
         290, 290, 307, 616, 616, 616, 616, 616, 290, 234, 234, 1755,
         234, 234, 1755, 234, 185, 185, 307, 616, 616, 616, 616, 290,
@@ -331,7 +331,7 @@ def test_sampling(self):
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
-    # @slow
+    @slow
     def test_slow_sampling(self):
         model_id = "ArthurZ/jukebox-5b-lyrics"
         model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval().to("cuda")

From ba609705943ce15039c16ea167c73449e6bc5ecc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 28 Jul 2022 12:43:26 +0000
Subject: [PATCH 068/196] starting code refactoring and renaming

---
 .../models/jukebox/configuration_jukebox.py   |   2 +
 .../models/jukebox/modeling_jukebox.py        | 175 +++++++-----------
 2 files changed, 70 insertions(+), 107 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 33423f6e934ab..26ba8b06dbaeb 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -226,6 +226,7 @@ def __init__(
         cond_m_conv=1,
         max_bow_genre_size=1,  # this should only be in the tokenizer
         name="AudioSamples",
+        init_std=0.2,
         **kwargs,
     ):
         self.name = name
@@ -353,6 +354,7 @@ def __init__(
         self.alignment_head = alignment_head
         self.m_attn = m_attn
 
+        self.init_std = init_std
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
 
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index a9fb080fd404a..5d77527bb6c2c 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -82,9 +82,9 @@ def get_range(x):
 
 
 # VQ-VAE building blocks
-class Conv1D(nn.Module):
-    def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
-        super(Conv1D, self).__init__()
+class JukeboxConv1D(nn.Module):
+    def __init__(self, n_in, n_out, zero_out=False):
+        super(JukeboxConv1D, self).__init__()
         self.n_in = n_in
         self.n_out = n_out
         if zero_out:
@@ -93,7 +93,7 @@ def __init__(self, n_in, n_out, zero_out=False, init_scale=1.0):
             w = torch.empty(n_in, n_out)
 
         b = torch.zeros(n_out)
-        self.weight = nn.Parameter(w)  # modified self.w
+        self.weight = nn.Parameter(w)
         self.bias = nn.Parameter(b)
 
     def forward(self, x):
@@ -105,32 +105,6 @@ def forward(self, x):
         return x
 
 
-class ResConvBlock(nn.Module):
-    def __init__(self, n_in, n_state):
-        super().__init__()
-        # TODO remvove the sequential in favor of a more understanble code
-        self.model = nn.Sequential(
-            nn.ReLU(),
-            nn.Conv2d(n_in, n_state, 3, 1, 1),
-            nn.ReLU(),
-            nn.Conv2d(n_state, n_in, 1, 1, 0),
-        )
-
-    def forward(self, x):
-        return x + self.model(x)
-
-
-class Resnet(nn.Module):
-    def __init__(self, n_in, n_depth, m_conv=1.0):
-        super().__init__()
-        # TODO remvove the sequential in favor of a more understanble code
-        # the list comprehension is maybe not very readable
-        self.model = nn.Sequential(*[ResConvBlock(n_in, int(m_conv * n_in)) for _ in range(n_depth)])
-
-    def forward(self, x):
-        return self.model(x)
-
-
 class ResConv1DBlock(nn.Module):
     def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
         super().__init__()
@@ -149,8 +123,8 @@ def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
             nn.init.zeros_(out.bias)
         self.res_scale = res_scale
 
-    def forward(self, x):
-        return x + self.res_scale * self.model(x)
+    def forward(self, hidden_states):
+        return hidden_states + self.res_scale * self.model(hidden_states)
 
 
 class Resnet1D(nn.Module):
@@ -194,12 +168,12 @@ def _get_depth(depth):
         else:
             self.model = nn.Sequential(*blocks)
 
-    def forward(self, x):
+    def forward(self, hidden_states):
         if self.checkpoint_res == 1:
             for block in self.blocks:
-                x = block(x)
-            return x
-        return self.model(x)
+                hidden_states = block(hidden_states)
+            return hidden_states
+        return self.model(hidden_states)
 
 
 class EncoderConvBlock(nn.Module):
@@ -232,8 +206,8 @@ def __init__(
             blocks.append(block)
         self.model = nn.Sequential(*blocks)
 
-    def forward(self, x):
-        return self.model(x)
+    def forward(self, hidden_states):
+        return self.model(hidden_states)
 
 
 class DecoderConvBock(nn.Module):
@@ -279,8 +253,8 @@ def __init__(
                 blocks.append(block)
         self.model = nn.Sequential(*blocks)
 
-    def forward(self, x):
-        return self.model(x)
+    def forward(self, hidden_states):
+        return self.model(hidden_states)
 
 
 class Encoder(nn.Module):
@@ -312,17 +286,17 @@ def level_block(level, down_t, stride_t):
         for level, down_t, stride_t in iterator:
             self.level_blocks.append(level_block(level, down_t, stride_t))
 
-    def forward(self, x):
-        xs = []
+    def forward(self, hidden_states):
+        all_hidden_states = []
 
         # 64, 32, ...
         iterator = zip(list(range(self.levels)), self.downs_t, self.strides_t)
         for level, down_t, stride_t in iterator:
             level_block = self.level_blocks[level]
-            x = level_block(x)
-            xs.append(x)
+            hidden_states = level_block(hidden_states)
+            all_hidden_states.append(hidden_states)
 
-        return xs
+        return all_hidden_states
 
 
 class Decoder(nn.Module):
@@ -345,19 +319,19 @@ def level_block(level, down_t, stride_t):
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
     def forward(self, xs, all_levels=True):
-        x = xs[-1]
+        hidden_states = xs[-1]
 
         # 32, 64 ...
         iterator = reversed(list(zip(list(range(self.levels)), self.downs_t, self.strides_t)))
         for level, down_t, stride_t in iterator:
             level_block = self.level_blocks[level]
-            x = level_block(x)
+            hidden_states = level_block(hidden_states)
 
             if level != 0 and all_levels:
-                x = x + xs[level - 1]
+                hidden_states = hidden_states + xs[level - 1]
 
-        x = self.out(x)
-        return x
+        hidden_states = self.out(hidden_states)
+        return hidden_states
 
 
 def dont_update(params):
@@ -411,22 +385,22 @@ def reset_k(self):
         self.k_elem = None
         self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
 
-    def _tile(self, x):
-        d, ew = x.shape
+    def _tile(self, hidden_states):
+        d, ew = hidden_states.shape
         if d < self.k_bins:
             n_repeats = (self.k_bins + d - 1) // d
             std = 0.01 / np.sqrt(ew)
-            x = x.repeat(n_repeats, 1)
-            x = x + torch.randn_like(x) * std
-        return x
+            hidden_states = hidden_states.repeat(n_repeats, 1)
+            hidden_states = hidden_states + torch.randn_like(hidden_states) * std
+        return hidden_states
 
-    def init_k(self, x):
-        # TODO rename x to a way more meaningful name
+    def init_k(self, hidden_states):
+        # TODO rename hidden_states to a way more meaningful name
 
         _, k_bins = self.emb_width, self.k_bins  # mu,
         self.init = True
-        # init k_w using random vectors from x
-        y = self._tile(x)
+        # init k_w using random vectors from hidden_states
+        y = self._tile(hidden_states)
         _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
         # dist.broadcast(_k_rand, 0)
         self.k = _k_rand
@@ -444,16 +418,16 @@ def restore_k(self, num_tokens=None, threshold=1.0):
             self.k_sum.data.mul_(expected_usage)
         self.threshold = threshold
 
-    def update_k(self, x, x_l):
+    def update_k(self, hidden_states, x_l):
         mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
         with torch.no_grad():
             # Calculate new centres
-            x_l_onehot = torch.zeros(k_bins, x.shape[0], device=x.device)  # k_bins, N * L
-            x_l_onehot.scatter_(0, x_l.view(1, x.shape[0]), 1)
+            x_l_onehot = torch.zeros(k_bins, hidden_states.shape[0], device=hidden_states.device)  # k_bins, N * L
+            x_l_onehot.scatter_(0, x_l.view(1, hidden_states.shape[0]), 1)
 
-            _k_sum = torch.matmul(x_l_onehot, x)  # k_bins, w
+            _k_sum = torch.matmul(x_l_onehot, hidden_states)  # k_bins, w
             _k_elem = x_l_onehot.sum(dim=-1)  # k_bins
-            y = self._tile(x)
+            y = self._tile(hidden_states)
             _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
 
             # Update centres
@@ -469,24 +443,23 @@ def update_k(self, x, x_l):
             dk = torch.norm(self.k - old_k) / np.sqrt(np.prod(old_k.shape))
         return dict(entropy=entropy, used_curr=used_curr, usage=usage, dk=dk)
 
-    def preprocess(self, x):
+    def preprocess(self, hidden_states):
         # NCT -> NTC -> [NT, C]
-        x = x.permute(0, 2, 1).contiguous()
-        x = x.view(-1, x.shape[-1])  # x_en = (N * L, w), k_j = (w, k_bins)
+        hidden_states = hidden_states.permute(0, 2, 1).contiguous()
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N * L, w), k_j = (w, k_bins)
 
-        if x.shape[-1] == self.emb_width:
-            prenorm = torch.norm(x - torch.mean(x)) / np.sqrt(np.prod(x.shape))
-        elif x.shape[-1] == 2 * self.emb_width:
-            x1, x2 = x[..., : self.emb_width], x[..., self.emb_width :]
+        if hidden_states.shape[-1] == self.emb_width:
+            prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
+        elif hidden_states.shape[-1] == 2 * self.emb_width:
+            x1, x2 = hidden_states[..., : self.emb_width], hidden_states[..., self.emb_width :]
             prenorm = (torch.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (
                 torch.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape))
             )
 
             # Normalise
-            x = x1 + x2
-        else:
-            assert False, f"Expected {x.shape[-1]} to be (1 or 2) * {self.emb_width}"
-        return x, prenorm
+            hidden_states = x1 + x2
+
+        return hidden_states, prenorm
 
     def postprocess(self, x_l, x_d, x_shape):
         # [NT, C] -> NTC -> NCT
@@ -511,15 +484,15 @@ def dequantise(self, x_l):
         x = F.embedding(x_l, self.k)
         return x
 
-    def encode(self, x):
-        N, width, T = x.shape
+    def encode(self, hidden_states):
+        N, _, T = hidden_states.shape
 
         # Preprocess.
-        x, prenorm = self.preprocess(x)
+        hidden_states, _ = self.preprocess(hidden_states)
         # TODO remvove unused prenorm variable
 
         # Quantise
-        x_l, fit = self.quantise(x)
+        x_l, _ = self.quantise(hidden_states)
         # TODO remvove unused fit and the return variable
 
         # Postprocess.
@@ -537,31 +510,31 @@ def decode(self, x_l):
         x_d = x_d.view(N, T, width).permute(0, 2, 1).contiguous()
         return x_d
 
-    def forward(self, x, update_k=True):
-        N, width, T = x.shape
+    def forward(self, hidden_states, update_k=True):
+        N, width, T = hidden_states.shape
 
         # Preprocess
-        x, prenorm = self.preprocess(x)
+        hidden_states, prenorm = self.preprocess(hidden_states)
 
         # Init k if not inited
         if update_k and not self.init:
-            self.init_k(x)
+            self.init_k(hidden_states)
 
         # Quantise and dequantise through bottleneck
-        x_l, fit = self.quantise(x)
+        x_l, fit = self.quantise(hidden_states)
         x_d = self.dequantise(x_l)
 
         # Update embeddings
         if update_k:
-            update_metrics = self.update_k(x, x_l)
+            update_metrics = self.update_k(hidden_states, x_l)
         else:
             update_metrics = {}
 
         # Loss
-        commit_loss = torch.norm(x_d.detach() - x) ** 2 / np.prod(x.shape)
+        commit_loss = torch.norm(x_d.detach() - hidden_states) ** 2 / np.prod(hidden_states.shape)
 
         # Passthrough
-        x_d = x + (x_d - x).detach()
+        x_d = hidden_states + (x_d - hidden_states).detach()
 
         # Postprocess
         x_l, x_d = self.postprocess(x_l, x_d, (N, T))
@@ -572,13 +545,9 @@ class Bottleneck(nn.Module):
     def __init__(self, l_bins, emb_width, mu, levels):
         super().__init__()
         self.levels = levels
-
-        def level_block(level):
-            return BottleneckBlock(l_bins, emb_width, mu)
-
         self.level_blocks = nn.ModuleList()
         for level in range(self.levels):
-            self.level_blocks.append(level_block(level))
+            self.level_blocks.append(BottleneckBlock(l_bins, emb_width, mu))
 
     def encode(self, xs):
         zs = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]
@@ -860,12 +829,10 @@ def _multispectral_loss(x_target, x_out, hps):
         recons_loss = torch.zeros(()).to(x.device)
         spec_loss = torch.zeros(()).to(x.device)
         multispec_loss = torch.zeros(()).to(x.device)
-        # x_target = audio_postprocess(x.float(), hps)
         x_target = x.float()
 
         for level in reversed(range(self.levels)):
             x_out = self.postprocess(x_outs[level])
-            # x_out = audio_postprocess(x_out, hps)
             this_recons_loss = _loss_fn(loss_fn, x_target, x_out, hps)
             this_spec_loss = _spectral_loss(x_target, x_out, hps)
             this_multispec_loss = _multispectral_loss(x_target, x_out, hps)
@@ -914,8 +881,8 @@ class JukeboxMLP(nn.Module):
     def __init__(self, width, n_state, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
         # a single channel is always used in original code
         super().__init__()
-        self.c_fc = Conv1D(width, n_state, init_scale=init_scale)
-        self.c_proj = Conv1D(n_state, width, zero_out, init_scale=init_scale)
+        self.c_fc = JukeboxConv1D(width, n_state)
+        self.c_proj = JukeboxConv1D(n_state, width, zero_out)
         self.act = ACT2FN[afn]
         self.dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
 
@@ -1006,11 +973,11 @@ def __init__(
         self.scale = scale
         self.mask = mask
         if attn_func == 6:
-            self.c_attn = Conv1D(width, n_state, init_scale=init_scale)
-            self.c_enc_kv = Conv1D(width, n_state * 2, init_scale=init_scale)
+            self.c_attn = JukeboxConv1D(width, n_state)
+            self.c_enc_kv = JukeboxConv1D(width, n_state * 2)
         else:
-            self.c_attn = Conv1D(width, n_state * 3, init_scale=init_scale)
-        self.c_proj = Conv1D(n_state, width, zero_out, init_scale=init_scale)
+            self.c_attn = JukeboxConv1D(width, n_state * 3)
+        self.c_proj = JukeboxConv1D(n_state, width, zero_out)
         self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
         self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
 
@@ -2563,9 +2530,7 @@ def conditioner_block(_level):
                 self.prime_prior = JukeboxConditionalAutoregressive(
                     input_shape=prime_input_shape, x_cond=False, y_cond=False, only_encode=True, **prime_kwargs
                 )
-                self.prime_state_proj = Conv1D(
-                    self.prime_acts_width, self.prime_state_width, init_scale=prime_kwargs["init_scale"]
-                )
+                self.prime_state_proj = JukeboxConv1D(self.prime_acts_width, self.prime_state_width)
                 self.prime_state_ln = LayerNorm(self.prime_state_width)
                 self.prime_bins = prime_kwargs["bins"]
                 self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)
@@ -2595,7 +2560,6 @@ def conditioner_block(_level):
 
     def get_y(self, labels, start, total_length, offset, get_indices=False):
         y = labels.clone()
-        # y = labels.clone()
         y[:, 0] = total_length
         # Set sample_length to match this level
         y[:, 2] = int(self.sample_length)
@@ -2653,14 +2617,11 @@ def prior_preprocess(self, xs, conds):
     def prior_postprocess(self, z):
         N = z.shape[0]
         dims = (self.prior_dims[0], z.shape[1] - self.prior_dims[0])
-        # xs = list(t.split(z, self.prior_dims, dim=1))
         xs = list(torch.split(z, dims, dim=1))
 
         for i in range(len(xs)):
-
             shape = self.prior_shapes[i]
             _, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])  # bins, -> _,
-            # xs[i] = (xs[i] - bins_shift).view(N, *shape) #view(N, -1, *shape[1:])
             xs[i] = (xs[i] - bins_shift).view(N, -1, *shape[1:])
             xs[i] = torch.clamp(
                 xs[i], min=0

From e19774ac1635c0cda285f40c2e53d2f0bee5dba0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 28 Jul 2022 12:54:50 +0000
Subject: [PATCH 069/196] zs -> music_tokens

---
 .../models/jukebox/modeling_jukebox.py        | 144 +++++++++---------
 1 file changed, 75 insertions(+), 69 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 5d77527bb6c2c..93ea029e9eabe 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -550,24 +550,24 @@ def __init__(self, l_bins, emb_width, mu, levels):
             self.level_blocks.append(BottleneckBlock(l_bins, emb_width, mu))
 
     def encode(self, xs):
-        zs = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]
-        return zs
+        music_tokens = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]
+        return music_tokens
 
-    def decode(self, zs, start_level=0, end_level=None):
+    def decode(self, music_tokens, start_level=0, end_level=None):
         if end_level is None:
             end_level = self.levels
         xs_quantised = [
-            level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], zs)
+            level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], music_tokens)
         ]
         return xs_quantised
 
     def forward(self, xs):
-        zs, xs_quantised, commit_losses, metrics = [], [], [], []
+        music_tokens, xs_quantised, commit_losses, metrics = [], [], [], []
         for level in range(self.levels):
             level_block = self.level_blocks[-level - 1]
             x = xs[level]
             z, x_quantised, commit_loss, metric = level_block(x, update_k=self.training)
-            zs.append(z)
+            music_tokens.append(z)
             if not self.training:
                 # Be extra paranoid and make sure the encoder weights can't
                 # change from straight-through estimator
@@ -576,7 +576,7 @@ def forward(self, xs):
             commit_losses.append(commit_loss)
             if self.training:
                 metrics.append(metric)
-        return zs, xs_quantised, commit_losses, metrics
+        return music_tokens, xs_quantised, commit_losses, metrics
 
 
 # TODO replace FFT calls with torch.fft
@@ -747,23 +747,23 @@ def postprocess(self, x):
         x = x.permute(0, 2, 1)
         return x
 
-    def _decode(self, zs, start_level=0, end_level=None):
+    def _decode(self, music_tokens, start_level=0, end_level=None):
         # Decode
         if end_level is None:
             end_level = self.levels
-        xs_quantised = self.bottleneck.decode(zs, start_level=start_level, end_level=end_level)
+        xs_quantised = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level)
         # Use only lowest level
         decoder, x_quantised = self.decoders[start_level], xs_quantised[0:1]
         x_out = decoder(x_quantised, all_levels=False)
         x_out = self.postprocess(x_out)
         return x_out
 
-    def decode(self, zs, start_level=0, end_level=None, bs_chunks=1):
-        z_chunks = [torch.chunk(z, bs_chunks, dim=0) for z in zs]
+    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+        z_chunks = [torch.chunk(z, bs_chunks, dim=0) for z in music_tokens]
         x_outs = []
         for i in range(bs_chunks):
-            zs_i = [z_chunk[i] for z_chunk in z_chunks]
-            x_out = self._decode(zs_i, start_level=start_level, end_level=end_level)
+            music_tokens_i = [z_chunk[i] for z_chunk in z_chunks]
+            x_out = self._decode(music_tokens_i, start_level=start_level, end_level=end_level)
             x_outs.append(x_out)
         return torch.cat(x_outs, dim=0)
 
@@ -777,22 +777,22 @@ def _encode(self, x, start_level=0, end_level=None):
             encoder = self.encoders[level]
             x_out = encoder(x_in)
             xs.append(x_out[-1])
-        zs = self.bottleneck.encode(xs)
-        return zs[start_level:end_level]
+        music_tokens = self.bottleneck.encode(xs)
+        return music_tokens[start_level:end_level]
 
     def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
         x_chunks = torch.chunk(x, bs_chunks, dim=0)
-        zs_list = []
+        music_tokens_list = []
         for x_i in x_chunks:
-            zs_i = self._encode(x_i, start_level=start_level, end_level=end_level)
-            zs_list.append(zs_i)
-        zs = [torch.cat(zs_level_list, dim=0) for zs_level_list in zip(*zs_list)]
-        return zs
+            music_tokens_i = self._encode(x_i, start_level=start_level, end_level=end_level)
+            music_tokens_list.append(music_tokens_i)
+        music_tokens = [torch.cat(music_tokens_level_list, dim=0) for music_tokens_level_list in zip(*music_tokens_list)]
+        return music_tokens
 
     def sample(self, n_samples):
         # TODO handle device properly
-        zs = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
-        return self.decode(zs)
+        music_tokens = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
+        return self.decode(music_tokens)
 
     def forward(self, x, hps, loss_fn="l1"):
         metrics = {}
@@ -805,7 +805,7 @@ def forward(self, x, hps, loss_fn="l1"):
             x_out = encoder(x_in)
             xs.append(x_out[-1])
 
-        zs, xs_quantised, commit_losses, quantiser_metrics = self.bottleneck(xs)
+        music_tokens, xs_quantised, commit_losses, quantiser_metrics = self.bottleneck(xs)
         x_outs = []
         for level in range(self.levels):
             decoder = self.decoders[level]
@@ -2592,10 +2592,10 @@ def set_y_lyric_tokens(self, labels):
         else:
             return labels, None
 
-    def get_z_conds(self, zs, start, end):
+    def get_z_conds(self, music_tokens, start, end):
         if self.level != self.levels - 1:
             assert start % self.cond_downsample == end % self.cond_downsample == 0
-            z_cond = zs[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
+            z_cond = music_tokens[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
             assert z_cond.shape[1] == self.n_ctx // self.cond_downsample
             z_conds = [z_cond]
         else:
@@ -2644,17 +2644,17 @@ def encode(self, x, start_level=None, end_level=None, bs_chunks=1):
             end_level = self.levels
         # Get latents
         with torch.no_grad():
-            zs = self.encoder(x, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
-        return zs
+            music_tokens = self.encoder(x, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return music_tokens
 
     # same as above, the va-vae is no longer part of the prior
-    def decode(self, zs, start_level=None, end_level=None, bs_chunks=1):
+    def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
         if start_level is None:
             start_level = self.level
         if end_level is None:
             end_level = self.levels
         with torch.no_grad():
-            x_out = self.decoder(zs, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+            x_out = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
         return x_out
 
     def get_cond(self, z_conds, y):
@@ -2882,10 +2882,10 @@ def save_wav(fname, lvl, metas, aud, sr):
             soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
 
 
-def get_alignment(x, zs, labels, prior, level, fp16, hps):
+def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
-    z = zs[level]
+    z = music_tokens[level]
     bs, total_length = z.shape[0], z.shape[1]
     if total_length < n_ctx:
         padding_length = n_ctx - total_length
@@ -2899,7 +2899,7 @@ def get_alignment(x, zs, labels, prior, level, fp16, hps):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    prior.to(zs.device)
+    prior.to(music_tokens.device)
     empty_cache()
     for start in get_starts(total_length, n_ctx, hop_length):
         end = start + n_ctx
@@ -2989,9 +2989,9 @@ def __init__(self, config):
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
-    def sample_partial_window(self, zs, labels, offset, sampling_kwargs, level, tokens_to_sample, hps):
+    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, hps):
         prior = self.priors[level]
-        z = zs[level]
+        z = music_tokens[level]
         n_ctx = prior.n_ctx
         current_tokens = z.shape[1]
         if current_tokens < n_ctx - tokens_to_sample:
@@ -3001,10 +3001,10 @@ def sample_partial_window(self, zs, labels, offset, sampling_kwargs, level, toke
             sampling_kwargs["sample_tokens"] = n_ctx
             start = current_tokens - n_ctx + tokens_to_sample
 
-        return self.sample_single_window(zs, labels, offset, sampling_kwargs, level, start, hps)
+        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, hps)
 
     # Sample a single window of length=n_ctx at position=start on level=level
-    def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start, hps):
+    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start, hps):
         prior = self.priors[level]
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
@@ -3013,7 +3013,7 @@ def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start
         # the tokenizer, as [total_length, offset, sample_length] can be written on the fly and changed without changing the
         # lyric tokens.
         # get z already sampled at current level
-        z = zs[level][:, start:end]
+        z = music_tokens[level][:, start:end]
 
         if "sample_tokens" in sampling_kwargs:
             # Support sampling a window shorter than n_ctx
@@ -3032,10 +3032,10 @@ def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start
 
         if new_tokens <= 0:
             # Nothing new to sample
-            return zs
+            return music_tokens
 
         # get z_conds from level above
-        z_conds = prior.get_z_conds(zs, start, end)
+        z_conds = prior.get_z_conds(music_tokens, start, end)
         # if there are no levels above should return None!
 
         # set y offset, sample_length and lyrics okens
@@ -3059,25 +3059,25 @@ def sample_single_window(self, zs, labels, offset, sampling_kwargs, level, start
 
         # Update z with new sample
         z_new = z[:, -new_tokens:]
-        zs[level] = torch.cat([zs[level], z_new], dim=1)
-        return zs
+        music_tokens[level] = torch.cat([music_tokens[level], z_new], dim=1)
+        return music_tokens
 
     # Sample total_length tokens at level=level with hop_length=hop_length
-    def sample_level(self, zs, labels, offset, sampling_kwargs, level, total_length, hop_length, hps):
+    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, hps):
         # print(f"Sampling level {level}")
         print(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
             for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
-                zs = self.sample_single_window(zs, labels, offset, sampling_kwargs, level, start, hps)
+                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, hps)
 
         else:
-            zs = self.sample_partial_window(zs, labels, offset, sampling_kwargs, level, total_length, hps)
-        return zs
+            music_tokens = self.sample_partial_window(music_tokens, labels, offset, sampling_kwargs, level, total_length, hps)
+        return music_tokens
 
     # Sample multiple levels
     def _sample(
         self,
-        zs,
+        music_tokens,
         labels,
         sample_levels,
         metas=None,
@@ -3130,64 +3130,70 @@ def _sample(
             sample_levels = range(len(self.priors))
         for level in reversed(sample_levels):
             self.total_length = sampling_kwargs[level].pop("total_length")
-            self.priors[level].to(zs[level].device).eval()
+            self.priors[level].to(music_tokens[level].device).eval()
             empty_cache()
             hps.sample_length = self.total_length  # generated length of the signal
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             total_length = hps.sample_length // self.priors[level].raw_to_tokens
             hop_length = int(hps.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
-            zs = self.sample_level(
-                zs, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps
+            music_tokens = self.sample_level(
+                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps
             )
 
             self.priors[level].to("cpu")
             empty_cache()
-            self.vqvae.to(zs[level].device)
+            self.vqvae.to(music_tokens[level].device)
             # Decode sample
             with torch.no_grad():
-                x = self.vqvae.decode(zs[level:], start_level=level, bs_chunks=zs[level].shape[0])
+                raw_audio = self.vqvae.decode(
+                    music_tokens[level:], start_level=level, bs_chunks=music_tokens[level].shape[0]
+                )
             self.vqvae.to("cpu")
 
             if save_results:
                 logdir = f"{self.start_time}/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                torch.save(dict(zs=zs, labels=labels, sampling_kwargs=sampling_kwargs, x=x), f"{logdir}/data.pth.tar")
-                save_wav(logdir, level, metas=metas, aud=x, sr=hps.sr)
+                # torch.save(dict(music_tokens=music_tokens, labels=labels, sampling_kwargs=sampling_kwargs, raw_audio=raw_audio), f"{logdir}/data.pth.tar")
+                save_wav(logdir, level, metas=metas, aud=raw_audio, sr=hps.sr)
                 if (
                     alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
                 ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
                     # either use level which will be the given lovel or use the total nb of levels?
-                    # alignments = get_alignment(x, zs, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
+                    # alignments = get_alignment(x, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
                     pass  # TODO this is a really dirty fix
-        return zs
+        return music_tokens
 
     # Generate ancestral samples given a list of artists and genres
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
-        zs = [torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))]
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
-        return zs
+        music_tokens = [
+            torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))
+        ]
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
 
     # Continue ancestral sampling from previously saved codes
-    def continue_sample(self, zs, labels, **sampling_kwargs):
+    def continue_sample(self, music_tokens, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
-        return zs
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
 
     # Upsample given already generated upper-level codes
-    def upsample(self, zs, labels, **sampling_kwargs):
+    def upsample(self, music_tokens, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors) - 1))
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
-        return zs
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
-    def primed_sample(self, x, labels, **sampling_kwargs):
+    def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         sample_levels = list(range(len(self.priors)))
-        self.vqvae.to(x.device)
+        self.vqvae.to(raw_audio.device)
         with torch.no_grad():
-            zs = self.vqvae.encode(x, start_level=0, end_level=len(self.priors), bs_chunks=x.shape[0])
+            music_tokens = self.vqvae.encode(
+                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
+            )
         self.vqvae.to("cpu")
-        zs = self._sample(zs, labels, sample_levels, **sampling_kwargs)
-        return zs
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens

From 81e8fbe970a2b46689884844e322207a3682b4f1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 28 Jul 2022 13:22:45 +0000
Subject: [PATCH 070/196] update

---
 .../models/jukebox/modeling_jukebox.py        | 106 ++++++++++--------
 1 file changed, 58 insertions(+), 48 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 93ea029e9eabe..9f72e47cffb77 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -786,12 +786,16 @@ def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
         for x_i in x_chunks:
             music_tokens_i = self._encode(x_i, start_level=start_level, end_level=end_level)
             music_tokens_list.append(music_tokens_i)
-        music_tokens = [torch.cat(music_tokens_level_list, dim=0) for music_tokens_level_list in zip(*music_tokens_list)]
+        music_tokens = [
+            torch.cat(music_tokens_level_list, dim=0) for music_tokens_level_list in zip(*music_tokens_list)
+        ]
         return music_tokens
 
     def sample(self, n_samples):
         # TODO handle device properly
-        music_tokens = [torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes]
+        music_tokens = [
+            torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes
+        ]
         return self.decode(music_tokens)
 
     def forward(self, x, hps, loss_fn="l1"):
@@ -2818,6 +2822,17 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     config_class = JukeboxConfig
     base_model_prefix = "transformer"
 
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
@@ -2862,26 +2877,6 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-def save_wav(fname, lvl, metas, aud, sr):
-    import soundfile
-
-    # clip before saving?
-    aud = torch.clamp(aud, -1, 1).cpu().numpy()
-    for i in list(range(aud.shape[0])):
-        if metas is not None:
-            artists, genres, lyrics = list(
-                metas[i].values()
-            )  # twitter prompts or inputs are in the form of a dictionnary
-            soundfile.write(
-                f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav",
-                aud[i],
-                samplerate=sr,
-                format="wav",
-            )
-        else:
-            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
-
-
 def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
@@ -2906,15 +2901,13 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
 
         # set y offset, sample_length and lyrics tokens
         y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True)
-        for indices in indices_hop:
-            assert len(indices) == n_tokens
 
         z_bs = torch.chunk(z, bs, dim=0)
         y_bs = torch.chunk(y, bs, dim=0)
         w_hops = []
         for z_i, y_i in zip(z_bs, y_bs):
             w_hop = prior.z_forward(z_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
-            assert len(w_hop) == 1
+
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -2940,35 +2933,48 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
             end = start + n_ctx
             alignment_hop = alignment_hops[start][item]
             indices = indices_hops[start][item]
-            assert len(indices) == n_tokens
-            assert alignment_hop.shape == (n_ctx, n_tokens)
+
             alignment[start:end, indices] = alignment_hop
         alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
         alignments.append(alignment)
     return alignments
 
 
+def save_wav(fname, lvl, metas, aud, sr):
+    import soundfile
+
+    aud = torch.clamp(aud, -1, 1).cpu().numpy()
+    for i in list(range(aud.shape[0])):
+        if metas is not None:
+            # twitter prompts or inputs are in the form of a dictionnary
+            artists, genres, lyrics = list(metas[i].values())
+            path = f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav"
+            soundfile.write(path, aud[i], samplerate=sr, format="wav")
+        else:
+            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
+
+
 def load_audio(file, sr, offset, duration, mono=False):
     import librosa
 
     # Librosa loads more filetypes than soundfile
-    x, _ = librosa.load(file, sr=sr, mono=mono, offset=offset / sr, duration=duration / sr)
-    if len(x.shape) == 1:
-        x = x.reshape((1, -1))
-    return x
+    raw_audio, _ = librosa.load(file, sr=sr, mono=mono, offset=offset / sr, duration=duration / sr)
+    if len(raw_audio.shape) == 1:
+        raw_audio = raw_audio.reshape((1, -1))
+    return raw_audio
 
 
 def load_prompts(audio_files, duration, hps):
-    xs = []
+    raw_audio_list = []
     for audio_file in audio_files:
-        x = load_audio(audio_file, sr=hps.sr, duration=duration, offset=0.0, mono=True)
-        x = x.T  # CT -> TC
-        xs.append(x)
-    while len(xs) < hps.n_samples:
-        xs.extend(xs)
-    xs = xs[: hps.n_samples]
-    x = torch.stack([torch.from_numpy(x) for x in xs])
-    return x
+        raw_audio = load_audio(audio_file, sr=hps.sr, duration=duration, offset=0.0, mono=True)
+        raw_audio = raw_audio.T  # CT -> TC
+        raw_audio_list.append(raw_audio)
+    while len(raw_audio_list) < hps.n_samples:
+        raw_audio_list.extend(raw_audio_list)
+    raw_audio_list = raw_audio_list[: hps.n_samples]
+    raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
+    return raw_audio
 
 
 @add_start_docstrings(
@@ -3009,11 +3015,8 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         n_samples = hps.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
-
-        # the tokenizer, as [total_length, offset, sample_length] can be written on the fly and changed without changing the
-        # lyric tokens.
         # get z already sampled at current level
-        z = music_tokens[level][:, start:end]
+        previous_sampled_tokens = music_tokens[level][:, start:end]
 
         if "sample_tokens" in sampling_kwargs:
             # Support sampling a window shorter than n_ctx
@@ -3023,7 +3026,10 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
 
         else:
             sample_tokens = end - start
-        conditioning_tokens, new_tokens = z.shape[1], sample_tokens - z.shape[1]
+        conditioning_tokens, new_tokens = (
+            previous_sampled_tokens.shape[1],
+            sample_tokens - previous_sampled_tokens.shape[1],
+        )
 
         print(
             f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on"
@@ -3046,7 +3052,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
-        z_list = split_batch(z, n_samples, max_batch_size)
+        z_list = split_batch(previous_sampled_tokens, n_samples, max_batch_size)
         z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
         y_list = split_batch(y, n_samples, max_batch_size)
         z_samples = []
@@ -3068,10 +3074,14 @@ def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, tot
         print(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
             for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
-                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, hps)
+                music_tokens = self.sample_single_window(
+                    music_tokens, labels, offset, sampling_kwargs, level, start, hps
+                )
 
         else:
-            music_tokens = self.sample_partial_window(music_tokens, labels, offset, sampling_kwargs, level, total_length, hps)
+            music_tokens = self.sample_partial_window(
+                music_tokens, labels, offset, sampling_kwargs, level, total_length, hps
+            )
         return music_tokens
 
     # Sample multiple levels

From 49dbf08f4eaf608654456de1f2eeee184e91607e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 29 Jul 2022 08:19:32 +0000
Subject: [PATCH 071/196] fixe save wav and get_alignment

---
 .../models/jukebox/modeling_jukebox.py        | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9f72e47cffb77..335f166db67d0 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2880,12 +2880,12 @@ def get_starts(total_length, n_ctx, hop_length):
 def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
-    z = music_tokens[level]
-    bs, total_length = z.shape[0], z.shape[1]
+    tokens = music_tokens[level]
+    bs, total_length = tokens.shape[0], tokens.shape[1]
     if total_length < n_ctx:
         padding_length = n_ctx - total_length
-        z = torch.cat([z, torch.zeros(bs, n_ctx - total_length, dtype=z.dtype, device=z.device)], dim=1)
-        total_length = z.shape[1]
+        tokens = torch.cat([tokens, torch.zeros(bs, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1)
+        total_length = tokens.shape[1]
     else:
         padding_length = 0
 
@@ -2894,19 +2894,19 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    prior.to(music_tokens.device)
+    prior.to(tokens.device)
     empty_cache()
     for start in get_starts(total_length, n_ctx, hop_length):
         end = start + n_ctx
 
         # set y offset, sample_length and lyrics tokens
-        y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True)
+        y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True,offset=0)
 
-        z_bs = torch.chunk(z, bs, dim=0)
+        tokens_bs = torch.chunk(tokens, bs, dim=0)
         y_bs = torch.chunk(y, bs, dim=0)
         w_hops = []
-        for z_i, y_i in zip(z_bs, y_bs):
-            w_hop = prior.z_forward(z_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
+        for tokens_i, y_i in zip(tokens_bs, y_bs):
+            w_hop = prior.z_forward(tokens_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
 
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
@@ -2947,8 +2947,8 @@ def save_wav(fname, lvl, metas, aud, sr):
     for i in list(range(aud.shape[0])):
         if metas is not None:
             # twitter prompts or inputs are in the form of a dictionnary
-            artists, genres, lyrics = list(metas[i].values())
-            path = f"{fname}/lvl_{lvl}-{artists[i]}-{genres[i]}-{lyrics[i][:5]}{i}.wav"
+            artists, genres, lyrics =  list(metas)[i].values()
+            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
             soundfile.write(path, aud[i], samplerate=sr, format="wav")
         else:
             soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
@@ -3169,9 +3169,8 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sr=hps.sr)
                 if (
                     alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
-                ):  # and not isinstance(self.priors[-1].labeller, Empty`Labeller`):
-                    # either use level which will be the given lovel or use the total nb of levels?
-                    # alignments = get_alignment(x, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
+                ): 
+                    alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
                     pass  # TODO this is a really dirty fix
         return music_tokens
 

From 37969d695b7761fbe45330199163d3d552a5d53e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 29 Jul 2022 09:34:04 +0000
Subject: [PATCH 072/196] update alignment

---
 src/transformers/models/jukebox/modeling_jukebox.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 335f166db67d0..77db5e3b34d90 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2900,7 +2900,7 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
         end = start + n_ctx
 
         # set y offset, sample_length and lyrics tokens
-        y, indices_hop = prior.get_y(labels, start, total_length, get_indices=True,offset=0)
+        y, indices_hop = prior.get_y(labels, start, hps.sample_length, get_indices=True,offset=0)
 
         tokens_bs = torch.chunk(tokens, bs, dim=0)
         y_bs = torch.chunk(y, bs, dim=0)
@@ -3171,7 +3171,7 @@ def _sample(
                     alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
                 ): 
                     alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
-                    pass  # TODO this is a really dirty fix
+                    pass  # consumes too much ram
         return music_tokens
 
     # Generate ancestral samples given a list of artists and genres

From f21c32bb79e4faad0b41c4b2b7201a1d56477502 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 29 Jul 2022 09:34:24 +0000
Subject: [PATCH 073/196] update names and doc

---
 .../models/jukebox/modeling_jukebox.py        | 55 ++++++++-----------
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 335f166db67d0..aed8a7967dcdd 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -47,9 +47,9 @@
 
 logger = logging.get_logger(__name__)
 
-# _CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
-# _CONFIG_FOR_DOC = "JukeboxConfig"
-# _TOKENIZER_FOR_DOC = "JukeboxTokenizer"
+_CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
+_CONFIG_FOR_DOC = "JukeboxConfig"
+_TOKENIZER_FOR_DOC = "JukeboxTokenizer"
 
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "ArthurZ/jukebox-dummy",
@@ -2982,37 +2982,34 @@ def load_prompts(audio_files, duration, hps):
     JUKEBOX_START_DOCSTRING,
 )
 class JukeboxModel(JukeboxPreTrainedModel):
-    _keys_to_ignore_on_load_missing = ["attn.masked_bias"]
     _no_split_modules = ["JukeboxBlock"]
 
     def __init__(self, config):
         super().__init__(config)
-
         self.embed_dim = config.hidden_size
-
         self.vqvae = JukeboxVQVAE(config)
         config.vqvae_z_shapes = self.vqvae.z_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
-    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, hps):
+    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample):
         prior = self.priors[level]
-        z = music_tokens[level]
+        sampled_tokens = music_tokens[level]
         n_ctx = prior.n_ctx
-        current_tokens = z.shape[1]
-        if current_tokens < n_ctx - tokens_to_sample:
-            sampling_kwargs["sample_tokens"] = current_tokens + tokens_to_sample
+        nb_sampled_tokens = sampled_tokens.shape[1]
+        if nb_sampled_tokens < n_ctx - tokens_to_sample:
+            sampling_kwargs["sample_tokens"] = nb_sampled_tokens + tokens_to_sample
             start = 0
         else:
             sampling_kwargs["sample_tokens"] = n_ctx
-            start = current_tokens - n_ctx + tokens_to_sample
+            start = nb_sampled_tokens - n_ctx + tokens_to_sample
 
-        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, hps)
+        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
 
     # Sample a single window of length=n_ctx at position=start on level=level
-    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start, hps):
+    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start):
         prior = self.priors[level]
-        n_samples = hps.n_samples
+        n_samples = self.config.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
         # get z already sampled at current level
@@ -3040,15 +3037,14 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             # Nothing new to sample
             return music_tokens
 
-        # get z_conds from level above
+        # get z_conds from level above TODO rename to latent_cond? Or music_token_cond
         z_conds = prior.get_z_conds(music_tokens, start, end)
         # if there are no levels above should return None!
 
-        # set y offset, sample_length and lyrics okens
-        y = prior.get_y(labels, start, self.total_length, offset)
+        # set y offset, sample_length and lyrics okens TODO rename to lyric_cond
+        y = prior.get_y(labels, start, self.config.sample_length, offset)
 
         empty_cache()
-        max_batch_size = 2
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
@@ -3069,18 +3065,17 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         return music_tokens
 
     # Sample total_length tokens at level=level with hop_length=hop_length
-    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, hps):
-        # print(f"Sampling level {level}")
+    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
         print(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
             for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
                 music_tokens = self.sample_single_window(
-                    music_tokens, labels, offset, sampling_kwargs, level, start, hps
+                    music_tokens, labels, offset, sampling_kwargs, level, start
                 )
 
         else:
             music_tokens = self.sample_partial_window(
-                music_tokens, labels, offset, sampling_kwargs, level, total_length, hps
+                music_tokens, labels, offset, sampling_kwargs, level, total_length
             )
         return music_tokens
 
@@ -3115,7 +3110,6 @@ def _sample(
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=total_length,
             ),
             dict(
                 temp=0.99,
@@ -3123,7 +3117,6 @@ def _sample(
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=total_length,
             ),
             dict(
                 temp=sampling_temperature,
@@ -3131,24 +3124,22 @@ def _sample(
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
-                total_length=total_length,
             ),
         ]
-        hps = self.config
         self.start_time = time.strftime("%Y-%m-%d-%Hh%M")
         if sample_levels is None:
             sample_levels = range(len(self.priors))
         for level in reversed(sample_levels):
-            self.total_length = sampling_kwargs[level].pop("total_length")
+            self.config.sample_length = total_length  # total length of the signal, might be bit different 
+            # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
             empty_cache()
-            hps.sample_length = self.total_length  # generated length of the signal
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
-            total_length = hps.sample_length // self.priors[level].raw_to_tokens
-            hop_length = int(hps.hop_fraction[-level - 1] * self.priors[level].n_ctx)
+            total_length = self.config.sample_length // self.priors[level].raw_to_tokens
+            hop_length = int(self.config.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
             music_tokens = self.sample_level(
-                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length, hps
+                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length
             )
 
             self.priors[level].to("cpu")

From cfbfdffe7ef2442877d7401da9f6750961ebd1a6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 2 Aug 2022 15:48:24 +0000
Subject: [PATCH 074/196] clean resconv1d block : from seq to module list

---
 .../models/jukebox/modeling_jukebox.py        | 332 ++++--------------
 1 file changed, 68 insertions(+), 264 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 8d2cff02f37e7..accd6b5df48bb 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -109,24 +109,19 @@ class ResConv1DBlock(nn.Module):
     def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
         super().__init__()
         padding = dilation
-        # TODO remvove the sequential in favor of a more understanble code
-        self.model = nn.Sequential(
-            nn.ReLU(),
-            nn.Conv1d(n_in, n_state, 3, 1, padding, dilation),
-            nn.ReLU(),
-            nn.Conv1d(n_state, n_in, 1, 1, 0),
-        )
-        # TODO remvove the initialisation scheme
-        if zero_out:
-            out = self.model[-1]
-            nn.init.zeros_(out.weight)
-            nn.init.zeros_(out.bias)
+        self.relu = nn.ReLU()
+        self.conv1d_1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
+        self.conv1d_2 = nn.Conv1d(n_state, n_in, 1, 1, 0)
         self.res_scale = res_scale
 
     def forward(self, hidden_states):
-        return hidden_states + self.res_scale * self.model(hidden_states)
-
-
+        residuals = hidden_states
+        hidden_states = self.relu(hidden_states)
+        hidden_states = self.conv1d_1(hidden_states)
+        hidden_states = self.relu(hidden_states)
+        hidden_states = self.conv1d_2(hidden_states)
+        return residuals + self.res_scale * hidden_states
+        
 class Resnet1D(nn.Module):
     def __init__(
         self,
@@ -148,18 +143,17 @@ def _get_depth(depth):
             else:
                 return depth % dilation_cycle
 
-        # TODO remvove comprehension in favor of a for loop more understanbnle
-
-        blocks = [
-            ResConv1DBlock(
-                n_in,
-                int(m_conv * n_in),
-                dilation=dilation_growth_rate ** _get_depth(depth),
-                zero_out=zero_out,
-                res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth),
+        blocks = []
+        for depth in range(n_depth) : 
+            blocks.append(ResConv1DBlock(
+                    n_in,
+                    int(m_conv * n_in),
+                    dilation=dilation_growth_rate ** _get_depth(depth),
+                    zero_out=zero_out,
+                    res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth),
+                )
             )
-            for depth in range(n_depth)
-        ]
+
         self.checkpoint_res = checkpoint_res
         if reverse_dilation:
             blocks = blocks[::-1]
@@ -233,6 +227,7 @@ def __init__(
             filter_t, pad_t = stride_t * 2, stride_t // 2
             block = nn.Conv1d(output_emb_width, width, 3, 1, 1)
             blocks.append(block)
+            # TODO replace with modulelists
             for i in range(down_t):
                 block = nn.Sequential(
                     Resnet1D(
@@ -280,7 +275,6 @@ def level_block(level, down_t, stride_t):
             )
 
         self.level_blocks = nn.ModuleList()
-        # TODO remvove iterator
 
         iterator = zip(list(range(self.levels)), downs_t, strides_t)
         for level, down_t, stride_t in iterator:
@@ -347,7 +341,7 @@ def update(params):
 def calculate_strides(strides, downs):
     return [stride**down for stride, down in zip(strides, downs)]
 
-
+# TODO Remove losses
 def _loss_fn(loss_fn, x_target, x_pred, hps):
     if loss_fn == "l1":
         return torch.mean(torch.abs(x_pred - x_target)) / hps.bandwidth["l1"]
@@ -395,7 +389,7 @@ def _tile(self, hidden_states):
         return hidden_states
 
     def init_k(self, hidden_states):
-        # TODO rename hidden_states to a way more meaningful name
+        # TODO rename y, k_bins and k to a way more meaningful name
 
         _, k_bins = self.emb_width, self.k_bins  # mu,
         self.init = True
@@ -792,7 +786,6 @@ def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
         return music_tokens
 
     def sample(self, n_samples):
-        # TODO handle device properly
         music_tokens = [
             torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes
         ]
@@ -898,10 +891,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-# TODO rename to JukeboxLayerNorm
-
-
-class LayerNorm(FusedLayerNorm):
+class JukeboxLayerNorm(FusedLayerNorm):
     def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
         super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
         self.width = np.prod(normalized_shape)
@@ -911,7 +901,7 @@ def forward(self, input):
         if input.numel() > self.max_numel:
             return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
         else:
-            return super(LayerNorm, self).forward(input.float()).type_as(input)
+            return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
 
 
 def repeat(x, n, dim):
@@ -1229,10 +1219,6 @@ def factored_qkv(self, x, encoder_kv=None, sample=False):
                     query = self._pad_to_block_ctx(query, query=True)
                     key = self._pad_to_block_ctx(key)
                     value = self._pad_to_block_ctx(value)
-                    assert key.shape[1] % self.block_ctx == 0
-                    assert query.shape[1] % self.block_ctx == 0
-                assert key.shape[1] == value.shape[1]
-                assert query.shape[1] <= key.shape[1]
                 sample = False
             else:
                 key = self.cache["key"]
@@ -1278,7 +1264,6 @@ def forward(self, x, encoder_kv=None, sample=False):
     @property
     def _prime_len(self):
         prime_len = self.prime_len
-        assert prime_len is not None
         prime_blocks = (prime_len // self.blocks) + 1
         return prime_blocks * self.blocks
 
@@ -1353,42 +1338,7 @@ def del_cache(self):
         if "value" in self.cache:
             del self.cache["value"]
         self.cache = {}
-
-    def check(self):
-        blocks = self.blocks or 1
-        spread = self.spread or 1
-        bs, l, d = (4, self.n_ctx, self.width)
-        x = torch.randn(bs, l, d).cpu()
-        x.requires_grad = True
-        x_out = self.forward(x)  # bs, l, d
-        loss = x_out.mean(dim=-1)  # bs, l
-        pos = 60
-        grad = torch.autograd.grad(loss[2, pos], x)[0]
-
-        assert grad.shape == (bs, l, d)
-        assert (grad[:2] == 0).all()
-        assert (grad[3:] == 0).all()
-        assert (grad[2, (pos + 1) :] == 0).all()
-        pos_grad = (torch.sum(grad[2] ** 2, dim=-1) > 0).nonzero().view(-1).cpu()
-
-        block_pos = pos - (pos % (l // blocks))
-        exp_pos_grad = {
-            0: torch.arange(pos),
-            1: torch.arange(block_pos, pos),
-            2: torch.arange(pos % (l // blocks), pos, l // blocks),
-            3: torch.arange(block_pos - l // blocks, block_pos),
-            4: torch.arange(l // blocks - 1, pos, l // blocks),
-            5: ((torch.arange(pos) % (l // blocks) >= (l // blocks - spread)) & (torch.arange(pos) < block_pos))
-            .nonzero()
-            .view(-1),
-        }[self.attn_func]
-        exp_pos_grad = torch.cat([exp_pos_grad, torch.tensor([pos])], dim=-1)
-
-        assert (len(pos_grad) == len(exp_pos_grad)) and (pos_grad == exp_pos_grad).all(), (
-            f"Expected pos grad {exp_pos_grad} got {pos_grad} for attn_func {self.attn_func} pos {pos} l {l} blocks"
-            f" {blocks}"
-        )
-
+        
     def check_cache(self, n_samples, sample_t, fp16):
         assert self.sample_t == sample_t, f"{self.sample_t} != {sample_t}"
         if sample_t == 0:
@@ -1401,86 +1351,6 @@ def check_cache(self, n_samples, sample_t, fp16):
             assert self.cache["key"].dtype == dtype, f"Expected {dtype}, got {self.cache['key'].dtype}"
             assert self.cache["value"].dtype == dtype, f"Expected {dtype}, got {self.cache['value'].dtype}"
 
-    def check_sample(self):
-        torch.manual_seed(42)
-        bs, l, d = (4, self.n_ctx, self.width)
-        prime = 5
-        x = torch.randn(bs, l, d).cpu()
-        xs = torch.chunk(x, l, dim=1)
-        assert self.sample_t == 0
-        assert self.cache == {}
-
-        with torch.no_grad():
-            enc_l = self.encoder_dims
-            encoder_kv = None
-            if self.attn_func == 6:
-                encoder_kv = torch.randn(bs, enc_l, d).cpu()
-
-            # Normal path
-            x_out_normal = self.forward(x, encoder_kv=encoder_kv)
-
-            # Sampling path
-            x_out_sample = torch.cat(
-                [self.forward(xs[i], encoder_kv=encoder_kv, sample=True) for i in range(l)], dim=1
-            )
-        max_err = torch.max(torch.abs(x_out_sample - x_out_normal))
-        assert max_err < 1e-8, (
-            "Max sampling err is"
-            f" {max_err} {[i for i in range(l) if torch.max(torch.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
-        )
-
-        with torch.no_grad():
-            x_out_normal = x_out_normal[:, :prime, :]
-            # Prime sampling path
-            self.del_cache()
-            x_out_sample = self.forward(x[:, :prime, :].contiguous(), encoder_kv=encoder_kv, sample=True)
-            self.check_cache(bs, prime, False)
-
-        max_err = torch.max(torch.abs(x_out_sample - x_out_normal))
-        assert max_err < 1e-8, (
-            "Max prime sampling err is"
-            f" {max_err} {[i for i in range(prime) if torch.max(torch.abs(x_out_sample - x_out_normal)[:,i,:]) > 1e-8]}"
-        )
-
-    def check_chunks(self, chunk_size):
-        torch.manual_seed(42)
-        bs, l, d = (4, self.n_ctx, self.width)
-        enc_l = self.encoder_dims
-        assert l % chunk_size == 0
-        n_chunks = l // chunk_size
-        with torch.no_grad():
-            encoder_kv = None
-            x = torch.randn(bs, l, d).cpu()
-            if self.attn_func == 6:
-                encoder_kv = torch.randn(bs, enc_l, d).cpu()
-
-            self.del_cache()
-            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=False)
-            self.del_cache()
-            y_forw_sample = self.forward(x, encoder_kv=encoder_kv, sample=True)
-            max_err = torch.max(torch.abs(y_forw - y_forw_sample))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_sample)[:, i, :]) > 1e-6]}"
-            )
-
-            self.del_cache()
-            x_chunks = torch.chunk(x, n_chunks, dim=1)
-            y_chunks = []
-            total_len = 0
-            for x_chunk in x_chunks:
-                y_chunk = self.forward(x_chunk.contiguous(), encoder_kv=encoder_kv, sample=True)
-                total_len += x_chunk.shape[1]
-                self.check_cache(bs, total_len, False)
-                y_chunks.append(y_chunk)
-            y_forw_in_chunks = torch.cat(y_chunks, dim=1)
-
-            max_err = torch.max(torch.abs(y_forw - y_forw_in_chunks))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
-            )
-
 
 class JukeboxBlock(nn.Module):
     # previously ResAttnBlock
@@ -1526,7 +1396,8 @@ def __init__(
             encoder_dims=encoder_dims,
             prime_len=prime_len,
         )
-        self.ln_0 = LayerNorm(width)
+        
+        self.ln_0 = JukeboxLayerNorm(width)
         self.mlp = JukeboxMLP(
             width=width,
             n_state=int(m_mlp * width),
@@ -1535,9 +1406,10 @@ def __init__(
             zero_out=zero_out,
             init_scale=init_scale,
         )
-        self.ln_1 = LayerNorm(width)
+        self.ln_1 = JukeboxLayerNorm(width)
         self.res_scale = res_scale
 
+        # TODO either support checkpointing for faster inference or get rid of this 
         self.checkpoint_attn = checkpoint_attn
         self.checkpoint_mlp = checkpoint_mlp
         self.width = width
@@ -1693,34 +1565,7 @@ def del_cache(self):
         for l in self._attn_mods:
             l.attn.del_cache()
 
-    def check_sample(self):
-        bs, l, s, d = (4, self.n_ctx, self.encoder_dims, self.width)
-        # prime = 5
-        with torch.no_grad():
-            encoder_kv = torch.randn(bs, s, d).cpu()
-            x = torch.randn(bs, l, d).cpu()
-            y_forw = self.forward(x, encoder_kv=encoder_kv, sample=True)
-
-            self.del_cache()
-            x_chunks = torch.chunk(x, 4, dim=1)
-            y_chunks = []
-            n = 0
-            for x_chunk in x_chunks:
-                self.check_cache(bs, n, False)
-                y_chunk = self.forward(x_chunk, encoder_kv=encoder_kv, sample=True)
-                y_chunks.append(y_chunk)
-                n += x_chunk.shape[1]
-            self.check_cache(bs, n, False)
-            y_forw_in_chunks = torch.cat(y_chunks, dim=1)
-
-            max_err = torch.max(torch.abs(y_forw - y_forw_in_chunks))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(y_forw - y_forw_in_chunks)[:, i, :]) > 1e-6]}"
-            )
-
-
-class PositionEmbedding(nn.Module):
+class JukeboxPositionalEmbedding(nn.Module):
     def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
         super().__init__()
         self.input_shape = input_shape
@@ -1772,19 +1617,22 @@ def __init__(
         self.input_shape = input_shape
         self.input_dims = input_dims = np.prod(input_shape)
         self.encoder_dims = encoder_dims
+        # TODO rename bins
         self.bins = bins
         self.width = width
         self.depth = depth
 
+        # TODO  rename x to proper name
         self.x_emb = nn.Embedding(bins, width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
         self.x_emb_dropout = nn.Dropout(emb_dropout)
+        # TODO rename y and y_cond to proper names
         self.y_cond = y_cond
         self.x_cond = x_cond
         if not y_cond:
             self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
 
-        self.pos_emb = PositionEmbedding(
+        self.pos_emb = JukeboxPositionalEmbedding(
             input_shape=input_shape, width=width, init_scale=init_scale, pos_init=pos_init
         )
         self.pos_emb_dropout = nn.Dropout(emb_dropout)
@@ -1844,7 +1692,8 @@ def postprocess(self, x, sample_tokens=None):
             return x.view(N, *self.input_shape)
         else:
             return x.view(N, -1)
-
+        
+    # TODO RENAME x, x_cond and y_cond, x_prime, x_gen, x_t
     def forward(
         self,
         x,
@@ -1904,6 +1753,7 @@ def forward(
         else:
             return loss, None
 
+    # TODO rename x, x_conds, y_conds
     def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
         N, D = n_samples, self.input_dims
         if sample_t == 0:
@@ -1914,15 +1764,14 @@ def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
                 x[:, 0] = self.start_token
         else:
             x = self.x_emb(x)
-        assert x.shape == (n_samples, 1, self.width)
         if x_cond.shape == (N, D, self.width):
             cond = x_cond[:, sample_t : sample_t + 1, :]
         else:
             cond = x_cond
         x = x + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
-        assert x.shape == (n_samples, 1, self.width)
         return x, cond
-
+    
+    # TODO rename x, x_conds, y_conds
     def sample(
         self,
         n_samples,
@@ -1946,41 +1795,40 @@ def sample(
             )
 
         with torch.no_grad():
-            xs, x = [], None
+            sampled_tokens, tokens = [], None
             if get_preds:
                 preds = []
 
             for sample_t in get_range(range(0, sample_tokens)):
 
-                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+                hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
-                x = self.transformer(
-                    x, encoder_kv=encoder_kv, sample=True, fp16=fp16
+                hidden_states = self.transformer(
+                    hidden_states, encoder_kv=encoder_kv, sample=True, fp16=fp16
                 )  # TODO put fp16 back # Transformer
                 if self.add_cond_after_transformer:
-                    x = x + cond
-                assert x.shape == (n_samples, 1, self.width)
-                x = self.x_out(x)  # Predictions
+                    hidden_states= hidden_states + cond
+                hidden_states = self.x_out(hidden_states)  # Predictions
                 if get_preds:
-                    preds.append(x.clone())
+                    preds.append(hidden_states.clone())
                 # Adjust logits
-                x = x / temp
-                x = filter_logits(x, top_k=top_k, top_p=top_p)
-                x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
-                assert x.shape == (n_samples, 1)
-                xs.append(x.clone())
-            del x
+                hidden_states = hidden_states / temp
+                hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+                tokens = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace x
+                sampled_tokens.append(tokens.clone())
+            del tokens
             self.transformer.del_cache()
 
-            x = torch.cat(xs, dim=1)
+            tokens = torch.cat(sampled_tokens, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            x = self.postprocess(x, sample_tokens)
+            tokens = self.postprocess(tokens, sample_tokens)
         if get_preds:
-            return x, preds
+            return tokens, preds
         else:
-            return x
+            return tokens
 
+    # TODO rename all
     def primed_sample(
         self,
         n_samples,
@@ -2059,7 +1907,6 @@ def primed_sample(
             self.transformer.check_cache(n_samples, len(xs), fp16)
 
             x = xs[-1]
-            assert x.shape == (n_samples, 1)
             empty_cache()
 
             for sample_t in get_range(range(len(xs), sample_tokens)):
@@ -2090,46 +1937,6 @@ def primed_sample(
         else:
             return x
 
-    def check_sample(self, chunk_size):
-        bs, l, d = (4, self.input_dims, self.width)
-        prime = int(self.input_dims // 8 * 7)
-        enc_l = self.encoder_dims
-        with torch.no_grad():
-            y_cond = torch.randn(bs, 1, d).cpu() if self.y_cond else None
-            x_cond = torch.randn(bs, l, d).cpu() if self.x_cond else None
-            encoder_kv = torch.randn(bs, enc_l, d).cpu()
-
-            x, preds_sample = self.sample(bs, x_cond, y_cond, encoder_kv, get_preds=True)
-            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
-            max_err = torch.max(torch.abs(preds_sample - preds_forw))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
-            )
-
-            x_prime = x.view(bs, -1)[:, :prime]
-            # unchunked
-            x, preds_sample = self.primed_sample(bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True)
-            assert (x.view(bs, -1)[:, :prime] == x_prime).all(), "Priming samples don't match"
-            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
-            max_err = torch.max(torch.abs(preds_sample - preds_forw))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
-            )
-
-            # chunked
-            x, preds_sample = self.primed_sample(
-                bs, x_prime.clone(), x_cond, y_cond, encoder_kv, get_preds=True, chunk_size=chunk_size
-            )
-            assert (x.view(bs, -1)[:, :prime] == x_prime).all(), "Priming samples don't match"
-            loss, preds_forw = self.forward(x, x_cond, y_cond, encoder_kv, get_preds=True)
-            max_err = torch.max(torch.abs(preds_sample - preds_forw))
-            assert max_err <= 1e-6, (
-                "Max err is"
-                f" {max_err} {[i for i in range(l) if torch.max(torch.abs(preds_sample - preds_forw)[:, i, :]) > 1e-6]}"
-            )
-
 
 def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
     """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
@@ -2182,9 +1989,6 @@ def split_chunks(length, chunk_size):
     return chunk_sizes
 
 
-# Conditioners
-
-
 class MusicTokenConditioner(nn.Module):
     """
     The MusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE codebook) and
@@ -2205,11 +2009,11 @@ def __init__(
         self.x_emb = nn.Embedding(bins, out_width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
 
-        # MusicTokenConditioner, takes as input either uper level tokens or raw audio? #TODO check that
+        # MusicTokenConditioner, takes as input either uper level tokens or raw audio?
         self.cond = DecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
-        self.ln = LayerNorm(self.width)
+        self.ln = JukeboxLayerNorm(self.width)
 
     def preprocess(self, x):
         x = x.permute(0, 2, 1)  # NTC -> NCT
@@ -2219,6 +2023,7 @@ def postprocess(self, x):
         x = x.permute(0, 2, 1)  # NCT -> NTC
         return x
 
+    # TODO rename to raw audio and hidden states
     def forward(self, x, x_cond=None):
         if x_cond is None:
             x_cond = 0.0
@@ -2249,7 +2054,7 @@ def __init__(self, bins, out_width, init_scale):
         super().__init__()
         self.bins = bins
         self.emb = nn.Embedding(bins, out_width)
-        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
+        # nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
 
     def forward(self, y):
         return self.emb(y)
@@ -2304,7 +2109,7 @@ def forward(self, pos_start, pos_end=None):
         bins = (self.bins * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,bins) -> [0,1...,bins-1]
         return self.emb(bins)
 
-
+# TODO rename y_bins and t_bins as well as y
 class LabelConditioner(nn.Module):
     def __init__(
         self,
@@ -2322,7 +2127,6 @@ def __init__(
         super().__init__()
         self.n_time = n_time
         self.out_width = out_width
-        assert len(y_bins) == 2, f"Expecting (genre, artist) bins, got {y_bins}"
         bow_genre_bins, artist_bins = y_bins
         self.max_bow_genre_size = max_bow_genre_size
         self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
@@ -2365,7 +2169,7 @@ def forward(self, y):
             pos_emb = None
         return start_emb, pos_emb
 
-
+# TODO rename every conditioning 
 class JukeboxPrior(nn.Module):
     """
     Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
@@ -2535,7 +2339,7 @@ def conditioner_block(_level):
                     input_shape=prime_input_shape, x_cond=False, y_cond=False, only_encode=True, **prime_kwargs
                 )
                 self.prime_state_proj = JukeboxConv1D(self.prime_acts_width, self.prime_state_width)
-                self.prime_state_ln = LayerNorm(self.prime_state_width)
+                self.prime_state_ln = JukeboxLayerNorm(self.prime_state_width)
                 self.prime_bins = prime_kwargs["bins"]
                 self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)
                 nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs["init_scale"])
@@ -2890,7 +2694,7 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
         padding_length = 0
 
     hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
-    alignment_head, alignment_layer = hps.alignment_head[-level - 1], hps.alignment_layer[-level - 1]
+    alignment_head, alignment_layer = hps.alignment_head[0], hps.alignment_layer[0]
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
@@ -2907,7 +2711,6 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
         w_hops = []
         for tokens_i, y_i in zip(tokens_bs, y_bs):
             w_hop = prior.z_forward(tokens_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
-
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -3157,11 +2960,12 @@ def _sample(
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
                 # torch.save(dict(music_tokens=music_tokens, labels=labels, sampling_kwargs=sampling_kwargs, raw_audio=raw_audio), f"{logdir}/data.pth.tar")
-                save_wav(logdir, level, metas=metas, aud=raw_audio, sr=hps.sr)
+                save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
                 if (
                     alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
                 ): 
-                    alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], hps)
+                    empty_cache()
+                    #alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
                     pass  # consumes too much ram
         return music_tokens
 

From d01fbf8bc0358a26ac5d438d83e00b74310eb18c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 5 Aug 2022 06:00:24 +0000
Subject: [PATCH 075/196] major VQVAE refactoring everything is modulist +
 simplified naming

---
 .../models/jukebox/modeling_jukebox.py        | 81 +++++++------------
 1 file changed, 27 insertions(+), 54 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index accd6b5df48bb..54837289a4fbe 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -157,18 +157,13 @@ def _get_depth(depth):
         self.checkpoint_res = checkpoint_res
         if reverse_dilation:
             blocks = blocks[::-1]
-        if self.checkpoint_res == 1:
-            self.blocks = nn.ModuleList(blocks)
-        else:
-            self.model = nn.Sequential(*blocks)
+        self.resnet_block = nn.ModuleList(blocks)
+        
 
     def forward(self, hidden_states):
-        if self.checkpoint_res == 1:
-            for block in self.blocks:
-                hidden_states = block(hidden_states)
-            return hidden_states
-        return self.model(hidden_states)
-
+        for block in self.resnet_block:
+            hidden_states = block(hidden_states)
+        return hidden_states
 
 class EncoderConvBlock(nn.Module):
     def __init__(
@@ -190,19 +185,18 @@ def __init__(
         filter_t, pad_t = stride_t * 2, stride_t // 2
         if down_t > 0:
             for i in range(down_t):
-                # TODO remvove the sequential in favor of a more understanble code
-                block = nn.Sequential(
-                    nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t),
-                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale),
+                blocks.append(nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t))
+                blocks.append(
+                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale)
                 )
-                blocks.append(block)
-            block = nn.Conv1d(width, output_emb_width, 3, 1, 1)
-            blocks.append(block)
-        self.model = nn.Sequential(*blocks)
+        self.proj_out = nn.Conv1d(width, output_emb_width, 3, 1, 1)
+        self.downsample_block = nn.ModuleList(blocks)
 
     def forward(self, hidden_states):
-        return self.model(hidden_states)
-
+        for block in self.downsample_block:
+            hidden_states = block(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+        return hidden_states
 
 class DecoderConvBock(nn.Module):
     def __init__(
@@ -225,12 +219,9 @@ def __init__(
         blocks = []
         if down_t > 0:
             filter_t, pad_t = stride_t * 2, stride_t // 2
-            block = nn.Conv1d(output_emb_width, width, 3, 1, 1)
-            blocks.append(block)
-            # TODO replace with modulelists
+            self.proj_in = nn.Conv1d(output_emb_width, width, 3, 1, 1)
             for i in range(down_t):
-                block = nn.Sequential(
-                    Resnet1D(
+                blocks.append( Resnet1D(
                         width,
                         depth,
                         m_conv,
@@ -240,18 +231,21 @@ def __init__(
                         res_scale=res_scale,
                         reverse_dilation=reverse_decoder_dilation,
                         checkpoint_res=checkpoint_res,
-                    ),
-                    nn.ConvTranspose1d(
+                    )
+                )
+                blocks.append( nn.ConvTranspose1d(
                         width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
-                    ),
+                    )
                 )
-                blocks.append(block)
-        self.model = nn.Sequential(*blocks)
+                
+        self.upsample_block = nn.ModuleList(blocks)
 
     def forward(self, hidden_states):
-        return self.model(hidden_states)
-
-
+        hidden_states = self.proj_in(hidden_states)
+        for block in self.upsample_block:
+            hidden_states = block(hidden_states)
+        return hidden_states
+    
 class Encoder(nn.Module):
     def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
         super().__init__()
@@ -341,27 +335,6 @@ def update(params):
 def calculate_strides(strides, downs):
     return [stride**down for stride, down in zip(strides, downs)]
 
-# TODO Remove losses
-def _loss_fn(loss_fn, x_target, x_pred, hps):
-    if loss_fn == "l1":
-        return torch.mean(torch.abs(x_pred - x_target)) / hps.bandwidth["l1"]
-    elif loss_fn == "l2":
-        return torch.mean((x_pred - x_target) ** 2) / hps.bandwidth["l2"]
-    elif loss_fn == "linf":
-        residual = ((x_pred - x_target) ** 2).reshape(x_target.shape[0], -1)
-        values, _ = torch.topk(residual, hps.linf_k, dim=1)
-        return torch.mean(values) / hps.bandwidth["l2"]
-    elif loss_fn == "lmix":
-        loss = 0.0
-        if hps.lmix_l1:
-            loss += hps.lmix_l1 * _loss_fn("l1", x_target, x_pred, hps)
-        if hps.lmix_l2:
-            loss += hps.lmix_l2 * _loss_fn("l2", x_target, x_pred, hps)
-        if hps.lmix_linf:
-            loss += hps.lmix_linf * _loss_fn("linf", x_target, x_pred, hps)
-        return loss
-    else:
-        assert False, f"Unknown loss_fn {loss_fn}"
 
 
 class BottleneckBlock(nn.Module):

From 7913547c17071b8ee94568a4dfb82e53fa270956 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 5 Aug 2022 13:02:48 +0000
Subject: [PATCH 076/196] update

---
 .../models/jukebox/convert_jukebox.py         | 101 +++++++-
 ...kebox_original_tf_checkpoint_to_pytorch.py |  75 ------
 .../models/jukebox/modeling_jukebox.py        | 229 ++++--------------
 .../models/jukebox/tokenization_jukebox.py    |   1 -
 4 files changed, 141 insertions(+), 265 deletions(-)
 delete mode 100644 src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 95d4b6d7d578e..ac05386181f91 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -55,6 +55,97 @@ def rename_key(dct, old, new):
 }
 
 
+def fix_jukebox_keys(state_dict, model_state_dict):
+    new_dict = {}
+    for original_key, value in state_dict.items():
+        key = original_key
+        wo_model = key.split("model")
+        if len(wo_model) == 2 and "encoders" in key:
+            if len(wo_model[1].split(".")) <= 3:
+                key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
+            else:
+                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
+                key = (
+                    wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
+                )
+        elif len(wo_model) == 2 and "decoders" in key:
+            if len(wo_model[1].split(".")) <= 3:
+                key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
+            else:
+                block_index = str(
+                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                )
+                key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
+        elif len(wo_model) == 2 and "cond.model." in key:
+            if len(wo_model[1].split(".")) <= 3:
+                key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
+            else:
+                block_index = str(
+                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                )
+                key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
+        elif len(wo_model) == 3 and "priors" in key:
+            # should also rename cond to low_lvl_conditioner
+            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
+            key = (
+                wo_model[0]
+                + "upsample_block."
+                + block_index
+                + ".resnet_block."
+                + wo_model[1].split(".")[-2]
+                + ".model"
+                + wo_model[2]
+            )
+        elif len(wo_model) == 4 and "decoders" in key:
+            # convert from
+            # model.1.0 is the first upsample block's resnet layer. Then this
+            # layer has resnet_blocks (1 to 3) which has a sequential (last model). 3 is the 3nd conv
+            # vqvae.decoders.0.level_blocks.0.model.1.0.model.1.model.3.bias
+            # to
+            # vqvae.decoders.1.level_blocks.0.upsample_block.1.resnet_blocks.2.conv1d_2.weight
+            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
+            key = (
+                wo_model[0]
+                + "upsample_block."
+                + block_index
+                + ".resnet_block."
+                + wo_model[2].split(".")[1]
+                + ".model"
+                + wo_model[3]
+            )
+        elif len(wo_model) == 4 and "encoders" in key:
+            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
+            key = (
+                wo_model[0]
+                + "downsample_block."
+                + block_index
+                + ".resnet_block."
+                + wo_model[2].split(".")[1]
+                + ".model"
+                + wo_model[3]
+            )
+
+        if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
+            key = key.replace(".model.1.bias", ".conv1d_1.bias")
+        elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
+            key = key.replace(".model.1.weight", ".conv1d_1.weight")
+        elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
+            key = key.replace(".model.3.bias", ".conv1d_2.bias")
+        elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
+            key = key.replace(".model.3.weight", ".conv1d_2.weight")
+
+        if key not in model_state_dict.keys():
+            print(f"failed converting {original_key} to {key}, does not match")
+        elif value.shape != model_state_dict[key].shape:
+            print(
+                f"{original_key}-> {key} : \nshape {model_state_dict[key].shape} and { value.shape},"
+                " do not match"
+            )
+            key = original_key
+        new_dict[key] = value
+    return new_dict
+
+
 @torch.no_grad()
 def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     """
@@ -69,9 +160,14 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         "model"
     ]
 
+    config = JukeboxConfig.from_pretrained(model_name)
+    model = JukeboxModel(config)
+    
     weight_dict = []
     for dict_name in priors:
         old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"]
+        
+
         new_dic = {}
         for k in old_dic.keys():
             if k.endswith(".b"):
@@ -82,10 +178,11 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
                 new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
             else:
                 new_dic[k] = old_dic[k]
+                
+        new_dic = fix_jukebox_keys(new_dic,model.state_dict())
         weight_dict.append(new_dic)
 
-    config = JukeboxConfig.from_pretrained(model_name)
-    model = JukeboxModel(config)
+    
 
     model.vqvae.load_state_dict(vqvae_dic)
     for i in range(len(weight_dict)):
diff --git a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
deleted file mode 100644
index 4b417038d3040..0000000000000
--- a/src/transformers/models/jukebox/convert_jukebox_original_tf_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert OpenAI GPT checkpoint."""
-
-
-import argparse
-
-import torch
-
-from transformers import JukeboxConfig, JukeboxModel
-from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
-
-
-logging.set_verbosity_info()
-
-
-def convert_jukebox_checkpoint_to_pytorch(jukebox_checkpoint_path, jukebox_config_file, pytorch_dump_folder_path):
-    # Construct model
-    if jukebox_config_file == "":
-        config = JukeboxConfig()
-    else:
-        config = JukeboxConfig.from_json_file(jukebox_config_file)
-    model = JukeboxModel(config)
-
-    # Load weights from numpy
-    # load_tf_weights_in_jukebox(model, config, jukebox_checkpoint_path)
-
-    # Save pytorch-model
-    pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME
-    pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME
-    print(f"Save PyTorch model to {pytorch_weights_dump_path}")
-    torch.save(model.state_dict(), pytorch_weights_dump_path)
-    print(f"Save configuration file to {pytorch_config_dump_path}")
-    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
-        f.write(config.to_json_string())
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--jukebox_checkpoint_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to the TensorFlow checkpoint path.",
-    )
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
-    )
-    parser.add_argument(
-        "--jukebox_config_file",
-        default="",
-        type=str,
-        help=(
-            "An optional config json file corresponding to the pre-trained OpenAI model. \n"
-            "This specifies the model architecture."
-        ),
-    )
-    args = parser.parse_args()
-    # convert_jukebox_checkpoint_to_pytorch(
-    #     args.jukebox_checkpoint_path, args.jukebox_config_file, args.pytorch_dump_folder_path
-    # )
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 54837289a4fbe..eb6b88f583445 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -121,7 +121,8 @@ def forward(self, hidden_states):
         hidden_states = self.relu(hidden_states)
         hidden_states = self.conv1d_2(hidden_states)
         return residuals + self.res_scale * hidden_states
-        
+
+
 class Resnet1D(nn.Module):
     def __init__(
         self,
@@ -144,8 +145,9 @@ def _get_depth(depth):
                 return depth % dilation_cycle
 
         blocks = []
-        for depth in range(n_depth) : 
-            blocks.append(ResConv1DBlock(
+        for depth in range(n_depth):
+            blocks.append(
+                ResConv1DBlock(
                     n_in,
                     int(m_conv * n_in),
                     dilation=dilation_growth_rate ** _get_depth(depth),
@@ -158,13 +160,13 @@ def _get_depth(depth):
         if reverse_dilation:
             blocks = blocks[::-1]
         self.resnet_block = nn.ModuleList(blocks)
-        
 
     def forward(self, hidden_states):
         for block in self.resnet_block:
             hidden_states = block(hidden_states)
         return hidden_states
 
+
 class EncoderConvBlock(nn.Module):
     def __init__(
         self,
@@ -198,6 +200,7 @@ def forward(self, hidden_states):
         hidden_states = self.proj_out(hidden_states)
         return hidden_states
 
+
 class DecoderConvBock(nn.Module):
     def __init__(
         self,
@@ -221,7 +224,8 @@ def __init__(
             filter_t, pad_t = stride_t * 2, stride_t // 2
             self.proj_in = nn.Conv1d(output_emb_width, width, 3, 1, 1)
             for i in range(down_t):
-                blocks.append( Resnet1D(
+                blocks.append(
+                    Resnet1D(
                         width,
                         depth,
                         m_conv,
@@ -233,11 +237,12 @@ def __init__(
                         checkpoint_res=checkpoint_res,
                     )
                 )
-                blocks.append( nn.ConvTranspose1d(
+                blocks.append(
+                    nn.ConvTranspose1d(
                         width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
                     )
                 )
-                
+
         self.upsample_block = nn.ModuleList(blocks)
 
     def forward(self, hidden_states):
@@ -245,7 +250,8 @@ def forward(self, hidden_states):
         for block in self.upsample_block:
             hidden_states = block(hidden_states)
         return hidden_states
-    
+
+
 class Encoder(nn.Module):
     def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
         super().__init__()
@@ -335,8 +341,7 @@ def update(params):
 def calculate_strides(strides, downs):
     return [stride**down for stride, down in zip(strides, downs)]
 
-
-
+# rename TODO
 class BottleneckBlock(nn.Module):
     def __init__(self, k_bins, emb_width, mu):
         super().__init__()
@@ -435,12 +440,12 @@ def postprocess(self, x_l, x_d, x_shape):
         x_l = x_l.view(N, T)
         return x_l, x_d
 
-    def quantise(self, x):
+    def quantise(self, hidden_states):
         # Calculate latent code x_l
         k_w = self.k.t()
         distance = (
-            torch.sum(x**2, dim=-1, keepdim=True)
-            - 2 * torch.matmul(x, k_w)
+            torch.sum(hidden_states**2, dim=-1, keepdim=True)
+            - 2 * torch.matmul(hidden_states, k_w)
             + torch.sum(k_w**2, dim=0, keepdim=True)
         )  # (N * L, b)
         min_distance, x_l = torch.min(distance, dim=-1)
@@ -456,11 +461,9 @@ def encode(self, hidden_states):
 
         # Preprocess.
         hidden_states, _ = self.preprocess(hidden_states)
-        # TODO remvove unused prenorm variable
 
         # Quantise
         x_l, _ = self.quantise(hidden_states)
-        # TODO remvove unused fit and the return variable
 
         # Postprocess.
         x_l = x_l.view(N, T)
@@ -546,94 +549,6 @@ def forward(self, xs):
         return music_tokens, xs_quantised, commit_losses, metrics
 
 
-# TODO replace FFT calls with torch.fft
-def stft(sig, hps):
-    return torch.stft(
-        sig,
-        hps.n_fft,
-        hps.hop_length,
-        win_length=hps.window_size,
-        window=torch.hann_window(hps.window_size, device=sig.device),
-    )
-
-
-# TODO replace spec
-def spec(x, hps):
-    return torch.norm(stft(x, hps), p=2, dim=-1)
-
-
-# TODO check if can be removed
-
-
-class DefaultSTFTValues:
-    def __init__(self, hps):
-        self.sr = hps.sr
-        self.n_fft = 2048
-        self.hop_length = 256
-        self.window_size = 6 * self.hop_length
-
-
-def norm(x):
-    return (x.view(x.shape[0], -1) ** 2).sum(dim=-1).sqrt()
-
-
-def squeeze(x):
-    if len(x.shape) == 3:
-        assert x.shape[-1] in [1, 2]
-        x = torch.mean(x, -1)
-    if len(x.shape) != 2:
-        raise ValueError(f"Unknown input shape {x.shape}")
-    return x
-
-
-def spectral_loss(x_in, x_out, hps):
-    hps = DefaultSTFTValues(hps)
-    spec_in = spec(squeeze(x_in.float()), hps)
-    spec_out = spec(squeeze(x_out.float()), hps)
-    return norm(spec_in - spec_out)
-
-
-def spectral_convergence(x_in, x_out, hps, epsilon=2e-3):
-    hps = DefaultSTFTValues(hps)
-    spec_in = spec(squeeze(x_in.float()), hps)
-    spec_out = spec(squeeze(x_out.float()), hps)
-
-    gt_norm = norm(spec_in)
-    residual_norm = norm(spec_in - spec_out)
-    mask = (gt_norm > epsilon).float()
-    return (residual_norm * mask) / torch.clamp(gt_norm, min=epsilon)
-
-
-class STFTValues:
-    def __init__(self, hps, n_fft, hop_length, window_size):
-        self.sr = hps.sr
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.window_size = window_size
-
-
-def multispectral_loss(x_in, x_out, hps):
-    losses = []
-    assert len(hps.multispec_loss_n_fft) == len(hps.multispec_loss_hop_length) == len(hps.multispec_loss_window_size)
-    args = [hps.multispec_loss_n_fft, hps.multispec_loss_hop_length, hps.multispec_loss_window_size]
-    for n_fft, hop_length, window_size in zip(*args):
-        hps = STFTValues(hps, n_fft, hop_length, window_size)
-        spec_in = spec(squeeze(x_in.float()), hps)
-        spec_out = spec(squeeze(x_out.float()), hps)
-        losses.append(norm(spec_in - spec_out))
-    return sum(losses) / len(losses)
-
-
-def average_metrics(_metrics):
-    metrics = {}
-    for _metric in _metrics:
-        for key, val in _metric.items():
-            if key not in metrics:
-                metrics[key] = []
-            metrics[key].append(val)
-    return {key: sum(vals) / len(vals) for key, vals in metrics.items()}
-
-
 class JukeboxVQVAE(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -764,9 +679,8 @@ def sample(self, n_samples):
         ]
         return self.decode(music_tokens)
 
-    def forward(self, x, hps, loss_fn="l1"):
-        metrics = {}
-
+    # TODO rename
+    def forward(self, x, hps):
         # Encode/Decode
         x_in = self.preprocess(x)
         xs = []
@@ -782,68 +696,13 @@ def forward(self, x, hps, loss_fn="l1"):
             x_out = decoder(xs_quantised[level : level + 1], all_levels=False)
             x_outs.append(x_out)
 
-        # Loss
-        def _spectral_loss(x_target, x_out, hps):
-            if hps.use_nonrelative_specloss:
-                sl = spectral_loss(x_target, x_out, hps) / hps.bandwidth["spec"]
-            else:
-                sl = spectral_convergence(x_target, x_out, hps)
-            sl = torch.mean(sl)
-            return sl
-
-        def _multispectral_loss(x_target, x_out, hps):
-            sl = multispectral_loss(x_target, x_out, hps) / hps.bandwidth["spec"]
-            sl = torch.mean(sl)
-            return sl
-
-        recons_loss = torch.zeros(()).to(x.device)
-        spec_loss = torch.zeros(()).to(x.device)
-        multispec_loss = torch.zeros(()).to(x.device)
-        x_target = x.float()
-
         for level in reversed(range(self.levels)):
             x_out = self.postprocess(x_outs[level])
-            this_recons_loss = _loss_fn(loss_fn, x_target, x_out, hps)
-            this_spec_loss = _spectral_loss(x_target, x_out, hps)
-            this_multispec_loss = _multispectral_loss(x_target, x_out, hps)
-            metrics[f"recons_loss_l{level + 1}"] = this_recons_loss
-            metrics[f"spectral_loss_l{level + 1}"] = this_spec_loss
-            metrics[f"multispectral_loss_l{level + 1}"] = this_multispec_loss
-            recons_loss += this_recons_loss
-            spec_loss += this_spec_loss
-            multispec_loss += this_multispec_loss
 
         commit_loss = sum(commit_losses)
-        loss = (
-            recons_loss + self.spectral * spec_loss + self.multispectral * multispec_loss + self.commit * commit_loss
-        )
-
-        with torch.no_grad():
-            sc = torch.mean(spectral_convergence(x_target, x_out, hps))
-            l2_loss = _loss_fn("l2", x_target, x_out, hps)
-            l1_loss = _loss_fn("l1", x_target, x_out, hps)
-            linf_loss = _loss_fn("linf", x_target, x_out, hps)
-
-        quantiser_metrics = average_metrics(quantiser_metrics)
-
-        metrics.update(
-            dict(
-                recons_loss=recons_loss,
-                spectral_loss=spec_loss,
-                multispectral_loss=multispec_loss,
-                spectral_convergence=sc,
-                l2_loss=l2_loss,
-                l1_loss=l1_loss,
-                linf_loss=linf_loss,
-                commit_loss=commit_loss,
-                **quantiser_metrics,
-            )
-        )
-
-        for key, val in metrics.items():
-            metrics[key] = val.detach()
+        loss = self.commit * commit_loss
 
-        return x_out, loss, metrics
+        return x_out, loss
 
 
 # Scalable transformer
@@ -1311,7 +1170,7 @@ def del_cache(self):
         if "value" in self.cache:
             del self.cache["value"]
         self.cache = {}
-        
+
     def check_cache(self, n_samples, sample_t, fp16):
         assert self.sample_t == sample_t, f"{self.sample_t} != {sample_t}"
         if sample_t == 0:
@@ -1326,7 +1185,6 @@ def check_cache(self, n_samples, sample_t, fp16):
 
 
 class JukeboxBlock(nn.Module):
-    # previously ResAttnBlock
     def __init__(
         self,
         width,
@@ -1369,7 +1227,7 @@ def __init__(
             encoder_dims=encoder_dims,
             prime_len=prime_len,
         )
-        
+
         self.ln_0 = JukeboxLayerNorm(width)
         self.mlp = JukeboxMLP(
             width=width,
@@ -1382,9 +1240,10 @@ def __init__(
         self.ln_1 = JukeboxLayerNorm(width)
         self.res_scale = res_scale
 
-        # TODO either support checkpointing for faster inference or get rid of this 
+        # TODO either support checkpointing for faster inference or get rid of this
         self.checkpoint_attn = checkpoint_attn
         self.checkpoint_mlp = checkpoint_mlp
+        
         self.width = width
         self.attn_func = attn_func
 
@@ -1538,6 +1397,7 @@ def del_cache(self):
         for l in self._attn_mods:
             l.attn.del_cache()
 
+
 class JukeboxPositionalEmbedding(nn.Module):
     def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
         super().__init__()
@@ -1555,7 +1415,6 @@ def forward(self):
 
 
 class JukeboxConditionalAutoregressive(nn.Module):
-    # previously ConditionalAutoregressive2D, renamed it to prior
     def __init__(
         self,
         input_shape,
@@ -1665,7 +1524,7 @@ def postprocess(self, x, sample_tokens=None):
             return x.view(N, *self.input_shape)
         else:
             return x.view(N, -1)
-        
+
     # TODO RENAME x, x_cond and y_cond, x_prime, x_gen, x_t
     def forward(
         self,
@@ -1743,7 +1602,7 @@ def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
             cond = x_cond
         x = x + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
         return x, cond
-    
+
     # TODO rename x, x_conds, y_conds
     def sample(
         self,
@@ -1773,14 +1632,13 @@ def sample(
                 preds = []
 
             for sample_t in get_range(range(0, sample_tokens)):
-
                 hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, x_cond, y_cond)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
                     hidden_states, encoder_kv=encoder_kv, sample=True, fp16=fp16
                 )  # TODO put fp16 back # Transformer
                 if self.add_cond_after_transformer:
-                    hidden_states= hidden_states + cond
+                    hidden_states = hidden_states + cond
                 hidden_states = self.x_out(hidden_states)  # Predictions
                 if get_preds:
                     preds.append(hidden_states.clone())
@@ -1817,7 +1675,6 @@ def primed_sample(
         chunk_size=None,
         sample_tokens=None,
     ):
-
         if sample_tokens is None:
             sample_tokens = self.input_dims
         # Preprocess.
@@ -1847,7 +1704,6 @@ def primed_sample(
             x = None
 
             for current_chunk_size in get_range(chunk_sizes):
-
                 xs_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     x_prime, cond_prime = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
@@ -2027,7 +1883,6 @@ def __init__(self, bins, out_width, init_scale):
         super().__init__()
         self.bins = bins
         self.emb = nn.Embedding(bins, out_width)
-        # nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
 
     def forward(self, y):
         return self.emb(y)
@@ -2082,6 +1937,7 @@ def forward(self, pos_start, pos_end=None):
         bins = (self.bins * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,bins) -> [0,1...,bins-1]
         return self.emb(bins)
 
+
 # TODO rename y_bins and t_bins as well as y
 class LabelConditioner(nn.Module):
     def __init__(
@@ -2142,7 +1998,8 @@ def forward(self, y):
             pos_emb = None
         return start_emb, pos_emb
 
-# TODO rename every conditioning 
+
+# TODO rename every conditioning
 class JukeboxPrior(nn.Module):
     """
     Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
@@ -2661,7 +2518,9 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     bs, total_length = tokens.shape[0], tokens.shape[1]
     if total_length < n_ctx:
         padding_length = n_ctx - total_length
-        tokens = torch.cat([tokens, torch.zeros(bs, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1)
+        tokens = torch.cat(
+            [tokens, torch.zeros(bs, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
+        )
         total_length = tokens.shape[1]
     else:
         padding_length = 0
@@ -2677,7 +2536,7 @@ def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
         end = start + n_ctx
 
         # set y offset, sample_length and lyrics tokens
-        y, indices_hop = prior.get_y(labels, start, hps.sample_length, get_indices=True,offset=0)
+        y, indices_hop = prior.get_y(labels, start, hps.sample_length, get_indices=True, offset=0)
 
         tokens_bs = torch.chunk(tokens, bs, dim=0)
         y_bs = torch.chunk(y, bs, dim=0)
@@ -2723,7 +2582,7 @@ def save_wav(fname, lvl, metas, aud, sr):
     for i in list(range(aud.shape[0])):
         if metas is not None:
             # twitter prompts or inputs are in the form of a dictionnary
-            artists, genres, lyrics =  list(metas)[i].values()
+            artists, genres, lyrics = list(metas)[i].values()
             path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
             soundfile.write(path, aud[i], samplerate=sr, format="wav")
         else:
@@ -2845,9 +2704,7 @@ def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, tot
         print(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
             for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
-                music_tokens = self.sample_single_window(
-                    music_tokens, labels, offset, sampling_kwargs, level, start
-                )
+                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
 
         else:
             music_tokens = self.sample_partial_window(
@@ -2906,7 +2763,7 @@ def _sample(
         if sample_levels is None:
             sample_levels = range(len(self.priors))
         for level in reversed(sample_levels):
-            self.config.sample_length = total_length  # total length of the signal, might be bit different 
+            self.config.sample_length = total_length  # total length of the signal, might be bit different
             # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
             empty_cache()
@@ -2934,11 +2791,9 @@ def _sample(
                     os.makedirs(logdir)
                 # torch.save(dict(music_tokens=music_tokens, labels=labels, sampling_kwargs=sampling_kwargs, raw_audio=raw_audio), f"{logdir}/data.pth.tar")
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
-                if (
-                    alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0
-                ): 
+                if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
                     empty_cache()
-                    #alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
+                    # alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
                     pass  # consumes too much ram
         return music_tokens
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 875b486beb3bb..fb874a17fd4f0 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -398,7 +398,6 @@ def convert_to_tensors(
             if not is_tensor(inputs):
                 inputs = as_tensor(inputs)
         except:  # noqa E722
-
             raise ValueError(
                 "Unable to create tensor, you should probably activate truncation and/or padding "
                 "with 'padding=True' 'truncation=True' to have batched tensors with the same length."

From 6dd8d073f85d483a5ddb3b40b4f67ee556ad076c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 8 Aug 2022 08:53:42 +0000
Subject: [PATCH 077/196] clean VQVAE code

---
 .../models/jukebox/modeling_jukebox.py        | 375 +++++++++---------
 1 file changed, 188 insertions(+), 187 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index eb6b88f583445..29e118bf2c0b5 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -284,8 +284,7 @@ def forward(self, hidden_states):
         all_hidden_states = []
 
         # 64, 32, ...
-        iterator = zip(list(range(self.levels)), self.downs_t, self.strides_t)
-        for level, down_t, stride_t in iterator:
+        for level in range(self.levels):
             level_block = self.level_blocks[level]
             hidden_states = level_block(hidden_states)
             all_hidden_states.append(hidden_states)
@@ -316,8 +315,7 @@ def forward(self, xs, all_levels=True):
         hidden_states = xs[-1]
 
         # 32, 64 ...
-        iterator = reversed(list(zip(list(range(self.levels)), self.downs_t, self.strides_t)))
-        for level, down_t, stride_t in iterator:
+        for level in reversed(range(self.levels)):
             level_block = self.level_blocks[level]
             hidden_states = level_block(hidden_states)
 
@@ -342,88 +340,85 @@ def calculate_strides(strides, downs):
     return [stride**down for stride, down in zip(strides, downs)]
 
 # rename TODO
-class BottleneckBlock(nn.Module):
-    def __init__(self, k_bins, emb_width, mu):
+class JukeboxBottleneckBlock(nn.Module):
+    def __init__(self, codebook_dim, codebook_width, mu):
         super().__init__()
-        self.k_bins = k_bins
-        self.emb_width = emb_width
+        self.codebook_dim = codebook_dim
+        self.codebook_width = codebook_width
         self.mu = mu
-        self.reset_k()
+        self.reset_codebook()
         self.threshold = 1.0
 
-    def reset_k(self):
+    def reset_codebook(self):
         self.init = False
-        self.k_sum = None
-        self.k_elem = None
-        self.register_buffer("k", torch.zeros(self.k_bins, self.emb_width))
+        self.codebook_sum = None
+        self.codebook_elem = None
+        self.register_buffer("codebook", torch.zeros(self.codebook_dim, self.codebook_width))
 
     def _tile(self, hidden_states):
-        d, ew = hidden_states.shape
-        if d < self.k_bins:
-            n_repeats = (self.k_bins + d - 1) // d
-            std = 0.01 / np.sqrt(ew)
+        dim, embed_width = hidden_states.shape
+        if dim < self.codebook_dim:
+            n_repeats = (self.codebook_dim + dim - 1) // dim
+            std = 0.01 / np.sqrt(embed_width)
             hidden_states = hidden_states.repeat(n_repeats, 1)
             hidden_states = hidden_states + torch.randn_like(hidden_states) * std
         return hidden_states
 
-    def init_k(self, hidden_states):
-        # TODO rename y, k_bins and k to a way more meaningful name
-
-        _, k_bins = self.emb_width, self.k_bins  # mu,
-        self.init = True
-        # init k_w using random vectors from hidden_states
-        y = self._tile(hidden_states)
-        _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
-        # dist.broadcast(_k_rand, 0)
-        self.k = _k_rand
-        self.k_sum = self.k
-        self.k_elem = torch.ones(k_bins, device=self.k.device)
-
-    def restore_k(self, num_tokens=None, threshold=1.0):
-        k_bins = self.k_bins  # mu -> _
+    def init_codebook(self, hidden_states):
+        codebook_dim = self.codebook_dim  # mu,
         self.init = True
-        self.k_sum = self.k.clone()
-        self.k_elem = torch.ones(k_bins, device=self.k.device)
-        if num_tokens is not None:
-            expected_usage = num_tokens / k_bins
-            self.k_elem.data.mul_(expected_usage)
-            self.k_sum.data.mul_(expected_usage)
-        self.threshold = threshold
-
-    def update_k(self, hidden_states, x_l):
-        mu, emb_width, k_bins = self.mu, self.emb_width, self.k_bins
+        # init k_w using random vectors from hidden_states codebook_w (index w?)
+        codes = self._tile(hidden_states)
+        # _k_rand = codes[torch.randperm(codes.shape[0])][:codebook_dim]
+        self.codebook = codes[torch.randperm(codes.shape[0])][:codebook_dim]
+        self.codebook_sum = self.codebook
+        self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
+
+    # def restore_k(self, num_tokens=None, threshold=1.0):
+    #     codebook_dim = self.codebook_dim  # mu -> _
+    #     self.init = True
+    #     self.codebook_sum = self.codebook.clone()
+    #     self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
+    #     if num_tokens is not None:
+    #         expected_usage = num_tokens / codebook_dim
+    #         self.codebook_elem.data.mul_(expected_usage)
+    #         self.codebook_sum.data.mul_(expected_usage)
+    #     self.threshold = threshold
+
+    def update_codebook(self, hidden_states, latent_states):
+        mu, codebook_width, codebook_dim = self.mu, self.codebook_width, self.codebook_dim
         with torch.no_grad():
             # Calculate new centres
-            x_l_onehot = torch.zeros(k_bins, hidden_states.shape[0], device=hidden_states.device)  # k_bins, N * L
-            x_l_onehot.scatter_(0, x_l.view(1, hidden_states.shape[0]), 1)
+            latent_states_onehot = torch.zeros(codebook_dim, hidden_states.shape[0], device=hidden_states.device)  # codebook_dim, N * L
+            latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
 
-            _k_sum = torch.matmul(x_l_onehot, hidden_states)  # k_bins, w
-            _k_elem = x_l_onehot.sum(dim=-1)  # k_bins
-            y = self._tile(hidden_states)
-            _k_rand = y[torch.randperm(y.shape[0])][:k_bins]
+            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # codebook_dim, w
+            _codebook_elem = latent_states_onehot.sum(dim=-1)  # codebook_dim
+            codes = self._tile(hidden_states)
+            _random_codebook = codes[torch.randperm(codes.shape[0])][:codebook_dim]
 
             # Update centres
-            old_k = self.k
-            self.k_sum = mu * self.k_sum + (1.0 - mu) * _k_sum  # w, k_bins
-            self.k_elem = mu * self.k_elem + (1.0 - mu) * _k_elem  # k_bins
-            usage = (self.k_elem.view(k_bins, 1) >= self.threshold).float()
-            self.k = usage * (self.k_sum.view(k_bins, emb_width) / self.k_elem.view(k_bins, 1)) + (1 - usage) * _k_rand
-            _k_prob = _k_elem / torch.sum(_k_elem)  # x_l_onehot.mean(dim=-1)  # prob of each bin
-            entropy = -torch.sum(_k_prob * torch.log(_k_prob + 1e-8))  # entropy ie how diverse
-            used_curr = (_k_elem >= self.threshold).sum()
+            old_codebook = self.codebook
+            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum  # w, codebook_dim
+            self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # codebook_dim
+            usage = (self.codebook_elem.view(codebook_dim, 1) >= self.threshold).float()
+            self.codebook = usage * (self.codebook_sum.view(codebook_dim, codebook_width) / self.codebook_elem.view(codebook_dim, 1)) + (1 - usage) * _random_codebook
+            _codebook_prob = _codebook_elem / torch.sum(_codebook_elem)  # latent_states_onehot.mean(dim=-1)  # prob of each bin
+            entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8))  # entropy ie how diverse
+            used_curr = (_codebook_elem >= self.threshold).sum()
             usage = torch.sum(usage)
-            dk = torch.norm(self.k - old_k) / np.sqrt(np.prod(old_k.shape))
+            dk = torch.norm(self.codebook - old_codebook) / np.sqrt(np.prod(old_codebook.shape))
         return dict(entropy=entropy, used_curr=used_curr, usage=usage, dk=dk)
 
     def preprocess(self, hidden_states):
         # NCT -> NTC -> [NT, C]
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N * L, w), k_j = (w, k_bins)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N * L, w), k_j = (w, codebook_dim)
 
-        if hidden_states.shape[-1] == self.emb_width:
+        if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
-        elif hidden_states.shape[-1] == 2 * self.emb_width:
-            x1, x2 = hidden_states[..., : self.emb_width], hidden_states[..., self.emb_width :]
+        elif hidden_states.shape[-1] == 2 * self.codebook_width:
+            x1, x2 = hidden_states[..., : self.codebook_width], hidden_states[..., self.codebook_width :]
             prenorm = (torch.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (
                 torch.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape))
             )
@@ -433,120 +428,119 @@ def preprocess(self, hidden_states):
 
         return hidden_states, prenorm
 
-    def postprocess(self, x_l, x_d, x_shape):
+    def postprocess(self, latent_states, dequantised_states, x_shape):
         # [NT, C] -> NTC -> NCT
         N, T = x_shape
-        x_d = x_d.view(N, T, -1).permute(0, 2, 1).contiguous()
-        x_l = x_l.view(N, T)
-        return x_l, x_d
+        dequantised_states= dequantised_states.view(N, T, -1).permute(0, 2, 1).contiguous()
+        latent_states = latent_states.view(N, T)
+        return latent_states, dequantised_states
 
-    def quantise(self, hidden_states):
-        # Calculate latent code x_l
-        k_w = self.k.t()
+    def quantise(self, latent_states):
+        # Calculate latent code latent_states
+        codebook_weights = self.codebook.t()
         distance = (
-            torch.sum(hidden_states**2, dim=-1, keepdim=True)
-            - 2 * torch.matmul(hidden_states, k_w)
-            + torch.sum(k_w**2, dim=0, keepdim=True)
+            torch.sum(latent_states**2, dim=-1, keepdim=True)
+            - 2 * torch.matmul(latent_states, codebook_weights)
+            + torch.sum(codebook_weights**2, dim=0, keepdim=True)
         )  # (N * L, b)
-        min_distance, x_l = torch.min(distance, dim=-1)
+        min_distance, music_tokens = torch.min(distance, dim=-1)
         fit = torch.mean(min_distance)
-        return x_l, fit
+        return music_tokens, fit
 
-    def dequantise(self, x_l):
-        x = F.embedding(x_l, self.k)
-        return x
+    def dequantise(self, music_tokens):
+        dequantised_states = F.embedding(music_tokens, self.codebook)
+        return dequantised_states
 
-    def encode(self, hidden_states):
-        N, _, T = hidden_states.shape
+    def encode(self, latent_states):
+        samples, _, seq_length = latent_states.shape
 
         # Preprocess.
-        hidden_states, _ = self.preprocess(hidden_states)
+        latent_states, _ = self.preprocess(latent_states)
 
         # Quantise
-        x_l, _ = self.quantise(hidden_states)
+        music_tokens, _ = self.quantise(latent_states)
 
         # Postprocess.
-        x_l = x_l.view(N, T)
-        return x_l
+        music_tokens = music_tokens.view(samples, seq_length)
+        return music_tokens
 
-    def decode(self, x_l):
-        N, T = x_l.shape
-        width = self.emb_width
+    def decode(self, music_tokens):
+        samples, seq_length = music_tokens.shape
 
         # Dequantise
-        x_d = self.dequantise(x_l)
+        dequantised_states = self.dequantise(music_tokens)
 
         # Postprocess
-        x_d = x_d.view(N, T, width).permute(0, 2, 1).contiguous()
-        return x_d
+        dequantised_states = dequantised_states.view(samples, seq_length, self.codebook_width).permute(0, 2, 1).contiguous()
+        return dequantised_states
 
-    def forward(self, hidden_states, update_k=True):
-        N, width, T = hidden_states.shape
+    def forward(self, hidden_states, update_codebook=True):
+        samples, width, seq_length = hidden_states.shape
 
         # Preprocess
         hidden_states, prenorm = self.preprocess(hidden_states)
 
         # Init k if not inited
-        if update_k and not self.init:
-            self.init_k(hidden_states)
+        if update_codebook and not self.init:
+            self.init_codebook(hidden_states)
 
         # Quantise and dequantise through bottleneck
-        x_l, fit = self.quantise(hidden_states)
-        x_d = self.dequantise(x_l)
+        music_tokens, fit = self.quantise(hidden_states)
+        dequantised_states= self.dequantise(music_tokens)
 
         # Update embeddings
-        if update_k:
-            update_metrics = self.update_k(hidden_states, x_l)
+        if update_codebook:
+            update_metrics = self.update_codebook(hidden_states, latent_states)
         else:
             update_metrics = {}
 
         # Loss
-        commit_loss = torch.norm(x_d.detach() - hidden_states) ** 2 / np.prod(hidden_states.shape)
+        commit_loss = torch.norm(dequantised_states.detach() - hidden_states) ** 2 / np.prod(hidden_states.shape)
 
         # Passthrough
-        x_d = hidden_states + (x_d - hidden_states).detach()
+        dequantised_states= hidden_states + (dequantised_states- hidden_states).detach()
 
         # Postprocess
-        x_l, x_d = self.postprocess(x_l, x_d, (N, T))
-        return x_l, x_d, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
+        latent_states, dequantised_states= self.postprocess(latent_states, dequantised_states, (samples, seq_length))
+        return latent_states, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
 
 
-class Bottleneck(nn.Module):
-    def __init__(self, l_bins, emb_width, mu, levels):
+class JukeboxBottleneck(nn.Module):
+    def __init__(self, codebook_dim, codebook_width, mu, levels):
         super().__init__()
         self.levels = levels
         self.level_blocks = nn.ModuleList()
         for level in range(self.levels):
-            self.level_blocks.append(BottleneckBlock(l_bins, emb_width, mu))
+            self.level_blocks.append(JukeboxBottleneckBlock(codebook_dim, codebook_width, mu))
 
-    def encode(self, xs):
-        music_tokens = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, xs)]
+    def encode(self, raw_audio):
+        music_tokens = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, raw_audio)]
         return music_tokens
 
     def decode(self, music_tokens, start_level=0, end_level=None):
         if end_level is None:
             end_level = self.levels
-        xs_quantised = [
+        quantised_audio = [
             level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], music_tokens)
         ]
-        return xs_quantised
+        return quantised_audio
 
-    def forward(self, xs):
-        music_tokens, xs_quantised, commit_losses, metrics = [], [], [], []
+    def forward(self, input_audio):
+        music_tokens, quantised_states, commit_losses, metrics = [], [], [], []
         for level in range(self.levels):
             level_block = self.level_blocks[-level - 1]
-            x = xs[level]
-            z, x_quantised, commit_loss, metric = level_block(x, update_k=self.training)
-            music_tokens.append(z)
+            hidden_states = input_audio[level]
+            sampled_tokens, quantised_states, commit_loss, metric = level_block(hidden_states, update_codebook=self.training)
+            music_tokens.append(sampled_tokens)
             if not self.training:
                 # Be extra paranoid and make sure the encoder weights can't
                 # change from straight-through estimator
-                x_quantised = x_quantised.detach()
-            xs_quantised.append(x_quantised)
+                quantised_state = quantised_state.detach()
+            quantised_states.append(quantised_state)
             commit_losses.append(commit_loss)
             if self.training:
                 metrics.append(metric)
-        return music_tokens, xs_quantised, commit_losses, metrics
+        return music_tokens, quantised_states, commit_losses, metrics
 
 
 class JukeboxVQVAE(PreTrainedModel):
@@ -570,13 +564,13 @@ def __init__(self, config):
         )
 
         multipliers = config.vq_vae_multipliers
-        emb_width = config.vq_vae_emmbedding_width
+        codebook_width = config.vq_vae_emmbedding_width
         self.width = config.vq_vae_width
         self.depth = config.vq_vae_depth
 
         self.downs_t = downs_t = config.vq_vae_downs_t
         self.strides_t = strides_t = config.vq_vae_strides_t
-        self.l_bins = l_bins = config.vq_vae_codebook_dimension
+        self.codebook_dim = codebook_dim = config.vq_vae_codebook_dimension
         self.commit = config.vq_vae_commit
         self.spectral = config.spectral
         self.multispectral = config.multispectral
@@ -603,12 +597,12 @@ def _block_kwargs(level):
 
         def encoder(level):
             return Encoder(
-                x_channels, emb_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+                x_channels, codebook_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
             )
 
         def decoder(level):
             return Decoder(
-                x_channels, emb_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+                x_channels, codebook_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
             )
 
         self.encoders = nn.ModuleList()
@@ -617,92 +611,92 @@ def decoder(level):
             self.encoders.append(encoder(level))
             self.decoders.append(decoder(level))
 
-        self.bottleneck = Bottleneck(l_bins, emb_width, config.vq_vae_lmu, levels)
+        self.bottleneck = JukeboxBottleneck(codebook_dim, codebook_width, config.vq_vae_lmu, levels)
 
-    def preprocess(self, x):
+    def preprocess(self, raw_audio):
         # x: NTC [-1,1] -> NCT [-1,1]
-        x = x.permute(0, 2, 1).float()
-        return x
+        raw_audio = raw_audio.permute(0, 2, 1).float()
+        return raw_audio
 
-    def postprocess(self, x):
+    def postprocess(self, dequantised_states):
         # x: NTC [-1,1] <- NCT [-1,1]
-        x = x.permute(0, 2, 1)
-        return x
+        dequantised_states = dequantised_states.permute(0, 2, 1)
+        return dequantised_states
 
     def _decode(self, music_tokens, start_level=0, end_level=None):
         # Decode
         if end_level is None:
             end_level = self.levels
-        xs_quantised = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level)
+        latent_states = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level)
         # Use only lowest level
-        decoder, x_quantised = self.decoders[start_level], xs_quantised[0:1]
-        x_out = decoder(x_quantised, all_levels=False)
-        x_out = self.postprocess(x_out)
-        return x_out
+        decoder, dequantised_state = self.decoders[start_level], latent_states[0:1]
+        dequantised_state = decoder(dequantised_state, all_levels=False)
+        dequantised_state = self.postprocess(dequantised_state)
+        return dequantised_state
 
     def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
-        z_chunks = [torch.chunk(z, bs_chunks, dim=0) for z in music_tokens]
-        x_outs = []
+        token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
+        dequantised_states = []
         for i in range(bs_chunks):
-            music_tokens_i = [z_chunk[i] for z_chunk in z_chunks]
-            x_out = self._decode(music_tokens_i, start_level=start_level, end_level=end_level)
-            x_outs.append(x_out)
-        return torch.cat(x_outs, dim=0)
+            music_tokens_i = [chunks[i] for chunks in token_chunks]
+            dequantised_state = self._decode(music_tokens_i, start_level=start_level, end_level=end_level)
+            dequantised_states.append(dequantised_state)
+        return torch.cat(dequantised_states, dim=0)
 
-    def _encode(self, x, start_level=0, end_level=None):
+    def _encode(self, raw_audio, start_level=0, end_level=None):
         # Encode
         if end_level is None:
             end_level = self.levels
-        x_in = self.preprocess(x)
-        xs = []
+        input_audio = self.preprocess(raw_audio)
+        latent_states = []
         for level in range(self.levels):
             encoder = self.encoders[level]
-            x_out = encoder(x_in)
-            xs.append(x_out[-1])
-        music_tokens = self.bottleneck.encode(xs)
+            latent_state = encoder(input_audio)
+            latent_states.append(latent_state[-1])
+        music_tokens = self.bottleneck.encode(latent_states)
         return music_tokens[start_level:end_level]
 
-    def encode(self, x, start_level=0, end_level=None, bs_chunks=1):
-        x_chunks = torch.chunk(x, bs_chunks, dim=0)
+    def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+        audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
         music_tokens_list = []
-        for x_i in x_chunks:
-            music_tokens_i = self._encode(x_i, start_level=start_level, end_level=end_level)
+        for chunk_i in audio_chunks:
+            music_tokens_i = self._encode(chunk_i, start_level=start_level, end_level=end_level)
             music_tokens_list.append(music_tokens_i)
         music_tokens = [
-            torch.cat(music_tokens_level_list, dim=0) for music_tokens_level_list in zip(*music_tokens_list)
+            torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)
         ]
         return music_tokens
 
     def sample(self, n_samples):
         music_tokens = [
-            torch.randint(0, self.l_bins, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes
+            torch.randint(0, self.codebook_dim, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes
         ]
         return self.decode(music_tokens)
 
     # TODO rename
-    def forward(self, x, hps):
+    def forward(self, raw_audio):
         # Encode/Decode
-        x_in = self.preprocess(x)
-        xs = []
+        input_audio = self.preprocess(raw_audio)
+        latent_states = []
         for level in range(self.levels):
             encoder = self.encoders[level]
-            x_out = encoder(x_in)
-            xs.append(x_out[-1])
+            latent_state = encoder(input_audio)
+            latent_states.append(latent_state[-1])
 
-        music_tokens, xs_quantised, commit_losses, quantiser_metrics = self.bottleneck(xs)
-        x_outs = []
+        _, quantised_audio, commit_losses, _ = self.bottleneck(latent_states)
+        dequantised_states = []
         for level in range(self.levels):
             decoder = self.decoders[level]
-            x_out = decoder(xs_quantised[level : level + 1], all_levels=False)
-            x_outs.append(x_out)
+            dequantised_state = decoder(quantised_audio[level : level + 1], all_levels=False)
+            dequantised_state.append(dequantised_state)
 
         for level in reversed(range(self.levels)):
-            x_out = self.postprocess(x_outs[level])
+            dequantised_state = self.postprocess(dequantised_states[level])
 
         commit_loss = sum(commit_losses)
         loss = self.commit * commit_loss
 
-        return x_out, loss
+        return dequantised_state, loss
 
 
 # Scalable transformer
@@ -1838,44 +1832,51 @@ def __init__(
         self.x_emb = nn.Embedding(bins, out_width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
 
-        # MusicTokenConditioner, takes as input either uper level tokens or raw audio?
+        # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level? 
         self.cond = DecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
+        # TODO rename all ln to layer_norm
         self.ln = JukeboxLayerNorm(self.width)
 
-    def preprocess(self, x):
-        x = x.permute(0, 2, 1)  # NTC -> NCT
-        return x
+    def preprocess(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1)  # NTC -> NCT
+        return hidden_states
 
-    def postprocess(self, x):
-        x = x.permute(0, 2, 1)  # NCT -> NTC
-        return x
+    def postprocess(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1)  # NCT -> NTC
+        return hidden_states
 
     # TODO rename to raw audio and hidden states
-    def forward(self, x, x_cond=None):
-        if x_cond is None:
-            x_cond = 0.0
+    def forward(self, music_tokens, raw_audio_conditionning=None):
+        """
+            Args : 
+                - music_tokens : indexes of codebook vectors
+                - raw_audio_conditionning : used when prime sampling, raw audio information that conditions 
+                the generation
+        """
+        if raw_audio_conditionning is None:
+            raw_audio_conditionning = 0.0
         # Embed x
-        x = x.long()
-        x = self.x_emb(x)
-        x = x + x_cond
+        music_tokens = music_tokens.long()
+        hidden_states = self.x_emb(music_tokens)
+        hidden_states = hidden_states + raw_audio_conditionning
 
         # Run conditioner
-        x = self.preprocess(x)
-        x = self.cond(x)
-        x = self.postprocess(x)
-        x = self.ln(x)
-        return x
+        hidden_states = self.preprocess(hidden_states)
+        hidden_states = self.cond(hidden_states)
+        hidden_states = self.postprocess(hidden_states)
+        hidden_states = self.ln(hidden_states)
+        return hidden_states
 
 
-def flip(x):
-    def _flip(x):
-        return x.permute(0, 2, 1).contiguous()
+def flip(hidden_states):
+    def _flip(hidden_states):
+        return hidden_states.permute(0, 2, 1).contiguous()
 
-    if isinstance(x, (list, tuple)):
-        return [flip(z) for z in x]
-    return _flip(x)
+    if isinstance(hidden_states, (list, tuple)):
+        return [flip(z) for z in hidden_states]
+    return _flip(hidden_states)
 
 
 class SimpleEmbedding(nn.Module):
@@ -2112,7 +2113,7 @@ def rescale(z_shape):
         self.y_cond = config.labels
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
-        # X conditioning : conditioning on music tokens (either from audio or from previous levels )
+        # X conditioning : conditioning on music tokens (either from audio or from previous levels or both)
         if self.x_cond:
             self.conditioner_blocks = nn.ModuleList()
 
@@ -2510,7 +2511,7 @@ def get_starts(total_length, n_ctx, hop_length):
         starts.append(start)
     return starts
 
-
+# TODO fix this, consumes too much RAM
 def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
@@ -2793,7 +2794,7 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
                     empty_cache()
-                    # alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
+                    alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
                     pass  # consumes too much ram
         return music_tokens
 

From c097f81b513220fbc59cb10214e27dd4b8deb3ef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 8 Aug 2022 09:28:11 +0000
Subject: [PATCH 078/196] style

---
 .../models/jukebox/convert_jukebox.py         |  26 +-
 .../models/jukebox/modeling_jukebox.py        | 224 ++++++++++--------
 2 files changed, 137 insertions(+), 113 deletions(-)

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index ac05386181f91..55aac464a7e5e 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -65,24 +65,18 @@ def fix_jukebox_keys(state_dict, model_state_dict):
                 key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
             else:
                 block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
-                key = (
-                    wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-                )
+                key = wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
         elif len(wo_model) == 2 and "decoders" in key:
             if len(wo_model[1].split(".")) <= 3:
                 key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
             else:
-                block_index = str(
-                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                )
+                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
                 key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
         elif len(wo_model) == 2 and "cond.model." in key:
             if len(wo_model[1].split(".")) <= 3:
                 key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
             else:
-                block_index = str(
-                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                )
+                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
                 key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
         elif len(wo_model) == 3 and "priors" in key:
             # should also rename cond to low_lvl_conditioner
@@ -137,10 +131,7 @@ def fix_jukebox_keys(state_dict, model_state_dict):
         if key not in model_state_dict.keys():
             print(f"failed converting {original_key} to {key}, does not match")
         elif value.shape != model_state_dict[key].shape:
-            print(
-                f"{original_key}-> {key} : \nshape {model_state_dict[key].shape} and { value.shape},"
-                " do not match"
-            )
+            print(f"{original_key}-> {key} : \nshape {model_state_dict[key].shape} and { value.shape}, do not match")
             key = original_key
         new_dict[key] = value
     return new_dict
@@ -162,11 +153,10 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
 
     config = JukeboxConfig.from_pretrained(model_name)
     model = JukeboxModel(config)
-    
+
     weight_dict = []
     for dict_name in priors:
         old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"]
-        
 
         new_dic = {}
         for k in old_dic.keys():
@@ -178,11 +168,9 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
                 new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
             else:
                 new_dic[k] = old_dic[k]
-                
-        new_dic = fix_jukebox_keys(new_dic,model.state_dict())
-        weight_dict.append(new_dic)
 
-    
+        new_dic = fix_jukebox_keys(new_dic, model.state_dict())
+        weight_dict.append(new_dic)
 
     model.vqvae.load_state_dict(vqvae_dic)
     for i in range(len(weight_dict)):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 29e118bf2c0b5..2dbb92261e0af 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -81,7 +81,6 @@ def get_range(x):
     from torch.nn import LayerNorm as FusedLayerNorm
 
 
-# VQ-VAE building blocks
 class JukeboxConv1D(nn.Module):
     def __init__(self, n_in, n_out, zero_out=False):
         super(JukeboxConv1D, self).__init__()
@@ -339,7 +338,7 @@ def update(params):
 def calculate_strides(strides, downs):
     return [stride**down for stride, down in zip(strides, downs)]
 
-# rename TODO
+
 class JukeboxBottleneckBlock(nn.Module):
     def __init__(self, codebook_dim, codebook_width, mu):
         super().__init__()
@@ -374,22 +373,13 @@ def init_codebook(self, hidden_states):
         self.codebook_sum = self.codebook
         self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
 
-    # def restore_k(self, num_tokens=None, threshold=1.0):
-    #     codebook_dim = self.codebook_dim  # mu -> _
-    #     self.init = True
-    #     self.codebook_sum = self.codebook.clone()
-    #     self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
-    #     if num_tokens is not None:
-    #         expected_usage = num_tokens / codebook_dim
-    #         self.codebook_elem.data.mul_(expected_usage)
-    #         self.codebook_sum.data.mul_(expected_usage)
-    #     self.threshold = threshold
-
     def update_codebook(self, hidden_states, latent_states):
         mu, codebook_width, codebook_dim = self.mu, self.codebook_width, self.codebook_dim
         with torch.no_grad():
             # Calculate new centres
-            latent_states_onehot = torch.zeros(codebook_dim, hidden_states.shape[0], device=hidden_states.device)  # codebook_dim, N * L
+            latent_states_onehot = torch.zeros(
+                codebook_dim, hidden_states.shape[0], device=hidden_states.device
+            )  # codebook_dim, N * L
             latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
 
             _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # codebook_dim, w
@@ -402,8 +392,14 @@ def update_codebook(self, hidden_states, latent_states):
             self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum  # w, codebook_dim
             self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # codebook_dim
             usage = (self.codebook_elem.view(codebook_dim, 1) >= self.threshold).float()
-            self.codebook = usage * (self.codebook_sum.view(codebook_dim, codebook_width) / self.codebook_elem.view(codebook_dim, 1)) + (1 - usage) * _random_codebook
-            _codebook_prob = _codebook_elem / torch.sum(_codebook_elem)  # latent_states_onehot.mean(dim=-1)  # prob of each bin
+            self.codebook = (
+                usage
+                * (self.codebook_sum.view(codebook_dim, codebook_width) / self.codebook_elem.view(codebook_dim, 1))
+                + (1 - usage) * _random_codebook
+            )
+            _codebook_prob = _codebook_elem / torch.sum(
+                _codebook_elem
+            )  # latent_states_onehot.mean(dim=-1)  # prob of each bin
             entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8))  # entropy ie how diverse
             used_curr = (_codebook_elem >= self.threshold).sum()
             usage = torch.sum(usage)
@@ -431,7 +427,7 @@ def preprocess(self, hidden_states):
     def postprocess(self, latent_states, dequantised_states, x_shape):
         # [NT, C] -> NTC -> NCT
         N, T = x_shape
-        dequantised_states= dequantised_states.view(N, T, -1).permute(0, 2, 1).contiguous()
+        dequantised_states = dequantised_states.view(N, T, -1).permute(0, 2, 1).contiguous()
         latent_states = latent_states.view(N, T)
         return latent_states, dequantised_states
 
@@ -471,7 +467,9 @@ def decode(self, music_tokens):
         dequantised_states = self.dequantise(music_tokens)
 
         # Postprocess
-        dequantised_states = dequantised_states.view(samples, seq_length, self.codebook_width).permute(0, 2, 1).contiguous()
+        dequantised_states = (
+            dequantised_states.view(samples, seq_length, self.codebook_width).permute(0, 2, 1).contiguous()
+        )
         return dequantised_states
 
     def forward(self, hidden_states, update_codebook=True):
@@ -486,7 +484,7 @@ def forward(self, hidden_states, update_codebook=True):
 
         # Quantise and dequantise through bottleneck
         music_tokens, fit = self.quantise(hidden_states)
-        dequantised_states= self.dequantise(music_tokens)
+        dequantised_states = self.dequantise(music_tokens)
 
         # Update embeddings
         if update_codebook:
@@ -498,10 +496,10 @@ def forward(self, hidden_states, update_codebook=True):
         commit_loss = torch.norm(dequantised_states.detach() - hidden_states) ** 2 / np.prod(hidden_states.shape)
 
         # Passthrough
-        dequantised_states= hidden_states + (dequantised_states- hidden_states).detach()
+        dequantised_states = hidden_states + (dequantised_states - hidden_states).detach()
 
         # Postprocess
-        latent_states, dequantised_states= self.postprocess(latent_states, dequantised_states, (samples, seq_length))
+        latent_states, dequantised_states = self.postprocess(latent_states, dequantised_states, (samples, seq_length))
         return latent_states, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
 
 
@@ -530,7 +528,9 @@ def forward(self, input_audio):
         for level in range(self.levels):
             level_block = self.level_blocks[-level - 1]
             hidden_states = input_audio[level]
-            sampled_tokens, quantised_states, commit_loss, metric = level_block(hidden_states, update_codebook=self.training)
+            sampled_tokens, quantised_states, commit_loss, metric = level_block(
+                hidden_states, update_codebook=self.training
+            )
             music_tokens.append(sampled_tokens)
             if not self.training:
                 # Be extra paranoid and make sure the encoder weights can't
@@ -597,12 +597,22 @@ def _block_kwargs(level):
 
         def encoder(level):
             return Encoder(
-                x_channels, codebook_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+                x_channels,
+                codebook_width,
+                level + 1,
+                downs_t[: level + 1],
+                strides_t[: level + 1],
+                **_block_kwargs(level),
             )
 
         def decoder(level):
             return Decoder(
-                x_channels, codebook_width, level + 1, downs_t[: level + 1], strides_t[: level + 1], **_block_kwargs(level)
+                x_channels,
+                codebook_width,
+                level + 1,
+                downs_t[: level + 1],
+                strides_t[: level + 1],
+                **_block_kwargs(level),
             )
 
         self.encoders = nn.ModuleList()
@@ -662,9 +672,7 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
         for chunk_i in audio_chunks:
             music_tokens_i = self._encode(chunk_i, start_level=start_level, end_level=end_level)
             music_tokens_list.append(music_tokens_i)
-        music_tokens = [
-            torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)
-        ]
+        music_tokens = [torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)]
         return music_tokens
 
     def sample(self, n_samples):
@@ -699,7 +707,9 @@ def forward(self, raw_audio):
         return dequantised_state, loss
 
 
-# Scalable transformer
+# Jukebox autoregressive model and its building blocks
+
+
 class JukeboxMLP(nn.Module):
     def __init__(self, width, n_state, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
         # a single channel is always used in original code
@@ -771,7 +781,7 @@ def __init__(
         width,
         n_ctx,
         n_state,
-        n_head,
+        num_heads,
         attn_dropout=0.0,
         resid_dropout=0.0,
         scale=True,
@@ -789,7 +799,7 @@ def __init__(
         self.width = width  # should have a better name
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
         self.n_state = n_state
-        self.n_head = n_head
+        self.num_heads = num_heads
         self.scale = scale
         self.mask = mask
         if attn_func == 6:
@@ -829,57 +839,69 @@ def __init__(
         self.record_attn = False
         self.w = None
 
-    def _attn(self, q, k, v, sample):
-        scale = 1.0 / math.sqrt(math.sqrt(self.n_state // self.n_head))
+    def _attn(self, query_states, key_states, value_states, sample):
+        scale = 1.0 / math.sqrt(math.sqrt(self.n_state // self.num_heads))
         if self.training:
-            w = torch.matmul(q * scale, k * scale)
+            attention_weight = torch.matmul(query_states * scale, key_states * scale)
         else:
-            w = torch.matmul(q, k)
-            w.mul_(scale * scale)
-        wtype = w.dtype
-        w = w.float()
+            attention_weight = torch.matmul(query_states, key_states)
+            attention_weight.mul_(scale * scale)
+        wtype = attention_weight.dtype
+        attention_weight = attention_weight.float()
         if self.mask:
             # Generate appropriate mask to mask out all positions before current
             # Might take up lot of memory for dense, so can cache it
             mask = get_mask(
-                self.attn_mask, q.size(-2), k.size(-1), self.blocks, self.spread, w.device, sample, self.sample_t
+                self.attn_mask,
+                query_states.size(-2),
+                key_states.size(-1),
+                self.blocks,
+                self.spread,
+                attention_weight.device,
+                sample,
+                self.sample_t,
             )
             if mask is not None:
                 # print(mask)
-                w = w * mask + -1e9 * (1 - mask)
-            w = F.softmax(w, dim=-1).type(wtype)
+                attention_weight = attention_weight * mask + -1e9 * (1 - mask)
+            attention_prob = F.softmax(attention_weight, dim=-1).type(wtype)
         else:
-            w = F.softmax(w, dim=-1).type(wtype)
+            attention_prob = F.softmax(attention_weight, dim=-1).type(wtype)
         if self.record_attn:
-            self.w = w  # .float().cpu().numpy()
+            self.attention_prob = attention_prob  # .float().cpu().numpy()
             if self.attn_func == 7:
                 # only keep music queries and lyrics keys/values
-                self.w = self.w[:, :, self.prime_len :, : self.prime_len]
-        w = self.attn_dropout(w)
-        a = torch.matmul(w, v)
-        return a
-
-    def merge_heads(self, x):
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = (*x.size()[:-2], x.size(-2) * x.size(-1))
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        new_x_shape = (*x.size()[:-1], self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+                self.attention_prob = self.attention_prob[:, :, self.prime_len :, : self.prime_len]
+        attention_prob = self.attn_dropout(attention_prob)
+        context_states = torch.matmul(attention_prob, value_states)
+        return context_states
+
+    def merge_heads(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+        new_hidden_states_shape = (*hidden_states.size()[:-2], hidden_states.size(-2) * hidden_states.size(-1))
+        return hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, hidden_states, k=False):
+        new_hidden_states_shape = (
+            *hidden_states.size()[:-1],
+            self.num_heads,
+            hidden_states.size(-1) // self.num_heads,
+        )
+        hidden_states = hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct split_states
         if k:
-            return x.permute(0, 2, 3, 1)
+            return hidden_states.permute(0, 2, 3, 1)
         else:
-            return x.permute(0, 2, 1, 3)
+            return hidden_states.permute(0, 2, 1, 3)
 
     def dense_attn(self, query, key, value, sample):
         query = self.split_heads(query)
         key = self.split_heads(key, k=True)
         value = self.split_heads(value)
-        a = self._attn(query, key, value, sample)
-        a = self.merge_heads(a)
-        return a
+        context_states = self._attn(query, key, value, sample)
+        context_states = self.merge_heads(context_states)
+        return context_states
 
+    # TODO rename here too
     def block_attn(self, q, k, v, sample):
         _, block_ctx = (
             self.blocks,
@@ -1183,7 +1205,7 @@ def __init__(
         self,
         width,
         n_ctx,
-        n_head,
+        num_heads,
         attn_dropout=0.0,
         resid_dropout=0.0,
         afn="gelu",
@@ -1207,7 +1229,7 @@ def __init__(
             width=width,
             n_ctx=n_ctx,
             n_state=int(m_attn * width),
-            n_head=n_head,
+            num_heads=num_heads,
             attn_dropout=attn_dropout,
             resid_dropout=resid_dropout,
             scale=scale,
@@ -1237,18 +1259,22 @@ def __init__(
         # TODO either support checkpointing for faster inference or get rid of this
         self.checkpoint_attn = checkpoint_attn
         self.checkpoint_mlp = checkpoint_mlp
-        
+
         self.width = width
         self.attn_func = attn_func
 
-    def forward(self, x, encoder_kv, sample=False):
-        a = self.attn(self.ln_0(x), encoder_kv, sample)
-        m = self.mlp(self.ln_1(x + a))
+    def forward(self, hidden_states, encoder_key_value, sample=False):
+        residuals = hidden_states
+        hidden_states = self.ln_0(hidden_states)
+        hidden_states = self.attn(hidden_states, encoder_key_value, sample)
+
+        output_states = self.ln_1(residuals + hidden_states)
+        output_states = self.mlp(output_states)
         if self.res_scale == 1.0:
-            h = x + a + m
+            output = residuals + hidden_states + output_states
         else:
-            h = x + self.res_scale * (a + m)
-        return h
+            output = residuals + self.res_scale * (hidden_states + output_states)
+        return output
 
 
 class JukeboxTransformer(nn.Module):
@@ -1256,7 +1282,7 @@ def __init__(
         self,
         width,
         n_ctx,
-        n_head,
+        num_heads,
         n_depth,
         attn_dropout=0.0,
         resid_dropout=0.0,
@@ -1285,7 +1311,7 @@ def __init__(
         if blocks is not None:
             self.block_ctx = n_ctx // blocks
         self.prime_len = prime_len
-        self.n_head = n_head
+        self.num_heads = num_heads
 
         res_scale = 1.0 / n_depth if res_scale else 1.0
 
@@ -1317,7 +1343,7 @@ def attn_block(d):
             return JukeboxBlock(
                 width=width,
                 n_ctx=n_ctx,
-                n_head=n_head,
+                num_heads=num_heads,
                 attn_dropout=attn_dropout,
                 resid_dropout=resid_dropout,
                 afn=afn,
@@ -1367,29 +1393,29 @@ def _should_record_attn(layer_idx):
             for l in self._attn_mods:
                 l.attn.w = None
 
-    def forward(self, x, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
+    def forward(self, hidden_states, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
         if fp16:
-            x = x.half()
+            hidden_states = hidden_states.half()
 
         # Blocks
-        for i, l in enumerate(self._attn_mods):
-            if l.attn_func == 6:
-                x = l(x, encoder_kv=encoder_kv, sample=sample)
+        for i, attn_layer in enumerate(self._attn_mods):
+            if attn_layer.attn_func == 6:
+                hidden_states = attn_layer(hidden_states, encoder_kv=encoder_kv, sample=sample)
             else:
-                x = l(x, encoder_kv=None, sample=sample)
-            if l.attn.record_attn:
-                self.ws.append(l.attn.w)
+                hidden_states = attn_layer(hidden_states, encoder_kv=None, sample=sample)
+            if attn_layer.attn.record_attn:
+                self.ws.append(attn_layer.attn.w)
         if not fp16_out:
-            x = x.float()
-        return x
+            hidden_states = hidden_states.float()
+        return hidden_states
 
     def check_cache(self, n_samples, sample_t, fp16):
-        for l in self._attn_mods:
-            l.attn.check_cache(n_samples, sample_t, fp16)
+        for attn_layer in self._attn_mods:
+            attn_layer.attn.check_cache(n_samples, sample_t, fp16)
 
     def del_cache(self):
-        for l in self._attn_mods:
-            l.attn.del_cache()
+        for attn_layer in self._attn_mods:
+            attn_layer.attn.del_cache()
 
 
 class JukeboxPositionalEmbedding(nn.Module):
@@ -1408,6 +1434,7 @@ def forward(self):
         return pos_emb
 
 
+# Most important renaming has to happen here
 class JukeboxConditionalAutoregressive(nn.Module):
     def __init__(
         self,
@@ -1466,7 +1493,7 @@ def __init__(
         self.transformer = JukeboxTransformer(
             width=width,
             n_ctx=input_dims,
-            n_head=heads,
+            num_heads=heads,
             n_depth=depth,
             attn_dropout=attn_dropout,
             resid_dropout=resid_dropout,
@@ -1832,7 +1859,7 @@ def __init__(
         self.x_emb = nn.Embedding(bins, out_width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
 
-        # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level? 
+        # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
         self.cond = DecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
@@ -1850,10 +1877,10 @@ def postprocess(self, hidden_states):
     # TODO rename to raw audio and hidden states
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
-            Args : 
-                - music_tokens : indexes of codebook vectors
-                - raw_audio_conditionning : used when prime sampling, raw audio information that conditions 
-                the generation
+        Args :
+            - music_tokens : indexes of codebook vectors
+            - raw_audio_conditionning : used when prime sampling, raw audio information that conditions
+            the generation
         """
         if raw_audio_conditionning is None:
             raw_audio_conditionning = 0.0
@@ -2045,7 +2072,7 @@ def rescale(z_shape):
             bins=config.l_bins,
             width=config.width[-level - 1],
             depth=config.depth[-level - 1],
-            heads=config.n_heads[-level - 1],
+            heads=config.num_headss[-level - 1],
             attn_order=config.attn_order[-level - 1],
             blocks=config.blocks,
             spread=config.spread,
@@ -2511,6 +2538,7 @@ def get_starts(total_length, n_ctx, hop_length):
         starts.append(start)
     return starts
 
+
 # TODO fix this, consumes too much RAM
 def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
@@ -2794,7 +2822,15 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
                     empty_cache()
-                    alignments = get_alignment(raw_audio, music_tokens, labels[-1], self.priors[-1], level, sampling_kwargs[-1]["fp16"], self.config)
+                    alignments = get_alignment(
+                        raw_audio,
+                        music_tokens,
+                        labels[-1],
+                        self.priors[-1],
+                        level,
+                        sampling_kwargs[-1]["fp16"],
+                        self.config,
+                    )
                     pass  # consumes too much ram
         return music_tokens
 

From fc1a2a1a4b02eaa255e2c799205395cae5ae93de Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 8 Aug 2022 12:51:15 +0000
Subject: [PATCH 079/196] update

---
 src/transformers/models/jukebox/modeling_jukebox.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2dbb92261e0af..8ee6dc6e57f74 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -488,7 +488,7 @@ def forward(self, hidden_states, update_codebook=True):
 
         # Update embeddings
         if update_codebook:
-            update_metrics = self.update_codebook(hidden_states, latent_states)
+            update_metrics = self.update_codebook(hidden_states, music_tokens)
         else:
             update_metrics = {}
 
@@ -499,8 +499,8 @@ def forward(self, hidden_states, update_codebook=True):
         dequantised_states = hidden_states + (dequantised_states - hidden_states).detach()
 
         # Postprocess
-        latent_states, dequantised_states = self.postprocess(latent_states, dequantised_states, (samples, seq_length))
-        return latent_states, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
+        music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_length))
+        return music_tokens, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
 
 
 class JukeboxBottleneck(nn.Module):
@@ -1839,6 +1839,7 @@ def split_chunks(length, chunk_size):
     return chunk_sizes
 
 
+# second most important renaming
 class MusicTokenConditioner(nn.Module):
     """
     The MusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE codebook) and
@@ -2540,9 +2541,9 @@ def get_starts(total_length, n_ctx, hop_length):
 
 
 # TODO fix this, consumes too much RAM
-def get_alignment(x, music_tokens, labels, prior, level, fp16, hps):
+def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
-    n_ctx, n_tokens = prior.n_ctx, prior.n_tokens
+    n_ctx = prior.n_ctx
     tokens = music_tokens[level]
     bs, total_length = tokens.shape[0], tokens.shape[1]
     if total_length < n_ctx:
@@ -2641,6 +2642,7 @@ def load_prompts(audio_files, duration, hps):
     return raw_audio
 
 
+# a little bit of renaming to do here, especially regarind "z"
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,
@@ -2823,7 +2825,6 @@ def _sample(
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
                     empty_cache()
                     alignments = get_alignment(
-                        raw_audio,
                         music_tokens,
                         labels[-1],
                         self.priors[-1],

From d3881833f0a86222efa87aad4ddac22588a1ede2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 8 Aug 2022 14:46:38 +0000
Subject: [PATCH 080/196] major renaming

---
 .../models/jukebox/modeling_jukebox.py        | 795 +++++++++---------
 1 file changed, 396 insertions(+), 399 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 8ee6dc6e57f74..e87fd9573e057 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -64,9 +64,9 @@ def empty_cache():
     torch.cuda.empty_cache()
 
 
-def get_range(x):
+def get_range(hidden_states):
     return tqdm(
-        x, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+        hidden_states, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
     )
 
 
@@ -95,13 +95,13 @@ def __init__(self, n_in, n_out, zero_out=False):
         self.weight = nn.Parameter(w)
         self.bias = nn.Parameter(b)
 
-    def forward(self, x):
-        size_out = (*x.size()[:-1], self.n_out)
-        x = torch.addmm(
-            self.bias.type_as(x), x.view(-1, x.size(-1)), self.weight.type_as(x)
-        )  # If x if float then float else half
-        x = x.view(*size_out)
-        return x
+    def forward(self, hidden_states):
+        size_out = (*hidden_states.size()[:-1], self.n_out)
+        hidden_states = torch.addmm(
+            self.bias.type_as(hidden_states), hidden_states.view(-1, hidden_states.size(-1)), self.weight.type_as(hidden_states)
+        )  # If hidden_states if float then float else half
+        hidden_states = hidden_states.view(*size_out)
+        return hidden_states
 
 
 class ResConv1DBlock(nn.Module):
@@ -310,8 +310,8 @@ def level_block(level, down_t, stride_t):
 
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
-    def forward(self, xs, all_levels=True):
-        hidden_states = xs[-1]
+    def forward(self, sampled_audio, all_levels=True):
+        hidden_states = sampled_audio[-1]
 
         # 32, 64 ...
         for level in reversed(range(self.levels)):
@@ -319,7 +319,7 @@ def forward(self, xs, all_levels=True):
             hidden_states = level_block(hidden_states)
 
             if level != 0 and all_levels:
-                hidden_states = hidden_states + xs[level - 1]
+                hidden_states = hidden_states + sampled_audio[level - 1]
 
         hidden_states = self.out(hidden_states)
         return hidden_states
@@ -478,7 +478,7 @@ def forward(self, hidden_states, update_codebook=True):
         # Preprocess
         hidden_states, prenorm = self.preprocess(hidden_states)
 
-        # Init k if not inited
+        # Init key if not inited
         if update_codebook and not self.init:
             self.init_codebook(hidden_states)
 
@@ -512,7 +512,7 @@ def __init__(self, codebook_dim, codebook_width, mu, levels):
             self.level_blocks.append(JukeboxBottleneckBlock(codebook_dim, codebook_width, mu))
 
     def encode(self, raw_audio):
-        music_tokens = [level_block.encode(x) for (level_block, x) in zip(self.level_blocks, raw_audio)]
+        music_tokens = [level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio)]
         return music_tokens
 
     def decode(self, music_tokens, start_level=0, end_level=None):
@@ -582,7 +582,7 @@ def __init__(self, config):
         self.downsamples = calculate_strides(strides_t, downs_t)
         self.hop_lengths = np.cumprod(self.downsamples)
         self.levels = levels = config.vq_vae_levels
-        self.z_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
+        self.music_tokens_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
 
         if multipliers is None:
             self.multipliers = [1] * levels
@@ -677,7 +677,7 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
 
     def sample(self, n_samples):
         music_tokens = [
-            torch.randint(0, self.codebook_dim, size=(n_samples, *z_shape), device="cpu") for z_shape in self.z_shapes
+            torch.randint(0, self.codebook_dim, size=(n_samples, *music_tokens_shape), device="cpu") for music_tokens_shape in self.music_tokens_shapes
         ]
         return self.decode(music_tokens)
 
@@ -691,11 +691,11 @@ def forward(self, raw_audio):
             latent_state = encoder(input_audio)
             latent_states.append(latent_state[-1])
 
-        _, quantised_audio, commit_losses, _ = self.bottleneck(latent_states)
+        _, music_tokens, commit_losses, _ = self.bottleneck(latent_states)
         dequantised_states = []
         for level in range(self.levels):
             decoder = self.decoders[level]
-            dequantised_state = decoder(quantised_audio[level : level + 1], all_levels=False)
+            dequantised_state = decoder(music_tokens[level : level + 1], all_levels=False)
             dequantised_state.append(dequantised_state)
 
         for level in reversed(range(self.levels)):
@@ -740,18 +740,18 @@ def forward(self, input):
             return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
 
 
-def repeat(x, n, dim):
+def repeat(hidden_states, n_repeat, dim):
     if dim == -1:
-        dim = len(x.shape) - 1
+        dim = len(hidden_states.shape) - 1
     return (
-        x.view(int(np.prod(x.shape[: dim + 1])), 1, int(np.prod(x.shape[dim + 1 :])))
-        .repeat(1, n, 1)
-        .view(*x.shape[:dim], n * x.shape[dim], *x.shape[dim + 1 :])
+        hidden_states.view(int(np.prod(hidden_states.shape[: dim + 1])), 1, int(np.prod(hidden_states.shape[dim + 1 :])))
+        .repeat(1, n_repeat, 1)
+        .view(*hidden_states.shape[:dim], n_repeat * hidden_states.shape[dim], *hidden_states.shape[dim + 1 :])
     )
 
 
 def get_mask(mask, q_l, kv_l, blocks, spread, device, sample, sample_t):
-    # returns a mask of shape 1 x 1 x q_l x kv_l or None if masking is not needed.
+    # returns a mask of shape 1 hidden_states 1 hidden_states q_l hidden_states kv_l or None if masking is not needed.
     if mask is None or q_l == 1:
         return None
     offset = sample_t - q_l if sample else max(kv_l - q_l, 0)
@@ -824,7 +824,7 @@ def __init__(
             7: (self.prime_qkv, self.prime_attn, "prime"),
         }[
             attn_func
-        ]  # Attend to last k position of each block
+        ]  # Attend to last key position of each block
 
         self.blocks = blocks
         self.spread = spread
@@ -846,7 +846,7 @@ def _attn(self, query_states, key_states, value_states, sample):
         else:
             attention_weight = torch.matmul(query_states, key_states)
             attention_weight.mul_(scale * scale)
-        wtype = attention_weight.dtype
+        attn_weight_type = attention_weight.dtype
         attention_weight = attention_weight.float()
         if self.mask:
             # Generate appropriate mask to mask out all positions before current
@@ -862,13 +862,10 @@ def _attn(self, query_states, key_states, value_states, sample):
                 self.sample_t,
             )
             if mask is not None:
-                # print(mask)
                 attention_weight = attention_weight * mask + -1e9 * (1 - mask)
-            attention_prob = F.softmax(attention_weight, dim=-1).type(wtype)
-        else:
-            attention_prob = F.softmax(attention_weight, dim=-1).type(wtype)
+        attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type)
         if self.record_attn:
-            self.attention_prob = attention_prob  # .float().cpu().numpy()
+            self.attention_prob = attention_prob 
             if self.attn_func == 7:
                 # only keep music queries and lyrics keys/values
                 self.attention_prob = self.attention_prob[:, :, self.prime_len :, : self.prime_len]
@@ -902,160 +899,160 @@ def dense_attn(self, query, key, value, sample):
         return context_states
 
     # TODO rename here too
-    def block_attn(self, q, k, v, sample):
+    def block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
         )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_length, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
-            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+            assert seq_length == self._suff_cache_len(), f"{seq_length} != {self._suff_cache_len()}"
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
-            ql = q.shape[1]
-            q = q.view(bs * ql // block_ctx, block_ctx, d)
-            if ql < l:
-                l = ql
-                k = k[:, -l:].contiguous()
-                v = v[:, -l:].contiguous()
-            k = k.view(bs * l // block_ctx, block_ctx, d)
-            v = v.view(bs * l // block_ctx, block_ctx, d)
-            return self.dense_attn(q, k, v, sample).view(bs, l, d)
-
-    def transpose_block_attn(self, q, k, v, sample):
+            query_length = query.shape[1]
+            query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
+            if query_length < seq_length:
+                seq_length = query_length
+                key = key[:, -seq_length:].contiguous()
+                value = value[:, -seq_length:].contiguous()
+            key = key.view(batch_size * seq_length // block_ctx, block_ctx, embed_dim)
+            value = value.view(batch_size * seq_length // block_ctx, block_ctx, embed_dim)
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_length, embed_dim)
+
+    def transpose_block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
         )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             block_l = (l - 1) % block_ctx
-            k = k[:, block_l::block_ctx, :]
-            v = v[:, block_l::block_ctx, :]
-            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+            key = key[:, block_l::block_ctx, :]
+            value = value[:, block_l::block_ctx, :]
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
         else:
-            ql = q.shape[1]
-            q = (
-                q.view(bs, ql // block_ctx, block_ctx, d)
+            ql = query.shape[1]
+            query = (
+                query.view(batch_size, ql // block_ctx, block_ctx, d)
                 .transpose(1, 2)
                 .contiguous()
-                .view(bs * block_ctx, ql // block_ctx, d)
+                .view(batch_size * block_ctx, ql // block_ctx, d)
             )
-            k = (
-                k.view(bs, l // block_ctx, block_ctx, d)
+            key = (
+                key.view(batch_size, l // block_ctx, block_ctx, d)
                 .transpose(1, 2)
                 .contiguous()
-                .view(bs * block_ctx, l // block_ctx, d)
+                .view(batch_size * block_ctx, l // block_ctx, d)
             )
-            v = (
-                v.view(bs, l // block_ctx, block_ctx, d)
+            value = (
+                value.view(batch_size, l // block_ctx, block_ctx, d)
                 .transpose(1, 2)
                 .contiguous()
-                .view(bs * block_ctx, l // block_ctx, d)
+                .view(batch_size * block_ctx, l // block_ctx, d)
             )
             return (
-                self.dense_attn(q, k, v, sample)
-                .view(bs, block_ctx, ql // block_ctx, d)
+                self.dense_attn(query, key, value, sample)
+                .view(batch_size, block_ctx, ql // block_ctx, d)
                 .transpose(1, 2)
                 .contiguous()
-                .view(bs, ql, d)
+                .view(batch_size, ql, d)
             )
 
-    def prev_block_attn(self, q, k, v, sample):
+    def prev_block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
         )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
             block = (l - 1) // block_ctx
             prev_l = (block - 1) * block_ctx
             if block > 0:
                 assert prev_l == 0
-                k = k[:, prev_l : prev_l + block_ctx, :]
-                v = v[:, prev_l : prev_l + block_ctx, :]
+                key = key[:, prev_l : prev_l + block_ctx, :]
+                value = value[:, prev_l : prev_l + block_ctx, :]
             else:
-                k = torch.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
-                v = torch.zeros(bs, block_ctx, d, device=q.device, dtype=q.dtype)
-            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+                key = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
+                value = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
         else:
-            ql = q.shape[1]
-            q = q.view(bs * ql // block_ctx, block_ctx, d)
-            k = torch.nn.functional.pad(
-                k.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(bs * l // block_ctx, block_ctx, d)
-            v = torch.nn.functional.pad(
-                v.view(bs, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(bs * l // block_ctx, block_ctx, d)
+            ql = query.shape[1]
+            query = query.view(batch_size * ql // block_ctx, block_ctx, d)
+            key = torch.nn.functional.pad(
+                key.view(batch_size, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(batch_size * l // block_ctx, block_ctx, d)
+            value = torch.nn.functional.pad(
+                value.view(batch_size, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(batch_size * l // block_ctx, block_ctx, d)
             if ql < l:
                 qb = ql // block_ctx
                 kb = l // block_ctx
                 l = ql
-                k = k.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
-                v = v.view(bs, kb, block_ctx, d)[:, -qb:].contiguous().view(bs * qb, block_ctx, d)
-            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+                key = key.view(batch_size, kb, block_ctx, d)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, d)
+                value = value.view(batch_size, kb, block_ctx, d)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, d)
+            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
 
-    def summary_attn(self, q, k, v, sample):
+    def summary_attn(self, query, key, value, sample):
         blocks, block_ctx = (
             self.blocks,
             self.block_ctx,
         )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            k = torch.nn.functional.pad(k[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
-            v = torch.nn.functional.pad(v[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
-            return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+            key = torch.nn.functional.pad(key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
+            value = torch.nn.functional.pad(value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
         else:
-            k = torch.nn.functional.pad(
-                k.view(bs, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # bs, blocks, d
-            v = torch.nn.functional.pad(
-                v.view(bs, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # bs, blocks, d
-            return self.dense_attn(q, k, v, sample).view(bs, l, d)
-
-    def summary_spread_attn(self, q, k, v, sample):
+            key = torch.nn.functional.pad(
+                key.view(batch_size, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # batch_size, blocks, d
+            value = torch.nn.functional.pad(
+                value.view(batch_size, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # batch_size, blocks, d
+            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
+
+    def summary_spread_attn(self, query, key, value, sample):
         blocks, _, spread = (
             self.blocks,
             self.block_ctx,
             self.spread,
         )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        bs, l, d = v.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             assert False, "Not yet implemented"
-            # k = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
-            # v = torch.nn.functional.pad(v,(0,0,block_ctx,(-l)%block_ctx)).view(bs, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(bs, -1, d)
-            # return self.dense_attn(q, k, v, sample).view(bs, 1, d)
+            # key = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, d)
+            # value = torch.nn.functional.pad(value,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, d)
+            # return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
         else:
-            k = (
-                torch.nn.functional.pad(k.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+            key = (
+                torch.nn.functional.pad(key.view(batch_size, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
                 .contiguous()
-                .view(bs, blocks * spread, d)
-            )  # bs, blocks * spread, d
-            v = (
-                torch.nn.functional.pad(v.view(bs, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                .view(batch_size, blocks * spread, d)
+            )  # batch_size, blocks * spread, d
+            value = (
+                torch.nn.functional.pad(value.view(batch_size, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
                 .contiguous()
-                .view(bs, blocks * spread, d)
-            )  # bs, blocks * spread, d
-            return self.dense_attn(q, k, v, sample).view(bs, l, d)
+                .view(batch_size, blocks * spread, d)
+            )  # batch_size, blocks * spread, d
+            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
 
-    def prime_attn(self, q, k, v, sample):
+    def prime_attn(self, query, key, value, sample):
         prime_len = self._prime_len
-        k = k[:, :prime_len]
-        v = v[:, :prime_len]
-        return self.dense_attn(q, k, v, sample)
+        key = key[:, :prime_len]
+        value = value[:, :prime_len]
+        return self.dense_attn(query, key, value, sample)
 
-    def decode_attn(self, q, k, v, sample):
+    def decode_attn(self, query, key, value, sample):
         assert (
-            k.shape[1] == v.shape[1] == self.encoder_dims
-        ), f"k: {k.shape}, v: {v.shape}, enc_dims: {self.encoder_dims}"
-        return self.dense_attn(q, k, v, sample)
-
-    def factored_qkv(self, x, encoder_kv=None, sample=False):
-        curr_ctx = x.shape[1]
-        assert encoder_kv is None
-        query, key, value = x.chunk(3, dim=2)
+            key.shape[1] == value.shape[1] == self.encoder_dims
+        ), f"k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}"
+        return self.dense_attn(query, key, value, sample)
+
+    def factored_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        assert encoder_key_value is None
+        query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             self.sample_t += curr_ctx
             key, value = self._append_cache(key, value)
@@ -1073,10 +1070,10 @@ def factored_qkv(self, x, encoder_kv=None, sample=False):
                 value = self.cache["value"]
         return query, key, value, sample
 
-    def prime_qkv(self, x, encoder_kv=None, sample=False):
-        curr_ctx = x.shape[1]
-        assert encoder_kv is None
-        query, key, value = x.chunk(3, dim=2)
+    def prime_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        assert encoder_key_value is None
+        query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             if self._cache_len() < self._prime_len:
                 self._append_cache(key, value)
@@ -1086,22 +1083,22 @@ def prime_qkv(self, x, encoder_kv=None, sample=False):
             self.sample_t += curr_ctx
         return query, key, value, sample
 
-    def decode_qkv(self, x, encoder_kv=None, sample=False):
-        curr_ctx = x.shape[1]
-        query = x
+    def decode_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        query = hidden_states
         if sample:
             if self.sample_t == 0:
-                self.cache["key"], self.cache["value"] = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+                self.cache["key"], self.cache["value"] = self.c_enc_kv(encoder_key_value.type_as(hidden_states)).chunk(2, dim=2)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         else:
-            key, value = self.c_enc_kv(encoder_kv.type_as(x)).chunk(2, dim=2)
+            key, value = self.c_enc_kv(encoder_key_value.type_as(hidden_states)).chunk(2, dim=2)
         return query, key, value, sample
 
-    def forward(self, x, encoder_kv=None, sample=False):
-        curr_ctx = x.shape[1]
-        x = self.c_attn(x)
-        query, key, value, sample = self.qkv(x, encoder_kv=encoder_kv, sample=sample)
+    def forward(self, hidden_states, encoder_key_value=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        hidden_states = self.c_attn(hidden_states)
+        query, key, value, sample = self.qkv(hidden_states, encoder_key_value=encoder_key_value, sample=sample)
         a = self.attn(query, key, value, sample)
         if a.shape[1] != curr_ctx:
             offset = self._offset(curr_ctx)
@@ -1120,15 +1117,15 @@ def _offset(self, curr_ctx):
             return 0
         return (self.sample_t - curr_ctx) % self.block_ctx
 
-    def _pad_to_block_ctx(self, x, query=False):
-        l = x.shape[1]
+    def _pad_to_block_ctx(self, hidden_states, query=False):
+        l = hidden_states.shape[1]
         offset = self._offset(l) if query else 0
         n_blocks = (l + offset + self.block_ctx - 1) // self.block_ctx
         pad = n_blocks * self.block_ctx - l - offset
         if pad == 0 and offset == 0:
-            return x
+            return hidden_states
         else:
-            return F.pad(x, (0, 0, offset, pad))
+            return F.pad(hidden_states, (0, 0, offset, pad))
 
     def _cache_len(self):
         return 0 if "key" not in self.cache else self.cache["key"].shape[1]
@@ -1393,16 +1390,16 @@ def _should_record_attn(layer_idx):
             for l in self._attn_mods:
                 l.attn.w = None
 
-    def forward(self, hidden_states, encoder_kv=None, sample=False, fp16=False, fp16_out=False):
+    def forward(self, hidden_states, encoder_key_value=None, sample=False, fp16=False, fp16_out=False):
         if fp16:
             hidden_states = hidden_states.half()
 
         # Blocks
         for i, attn_layer in enumerate(self._attn_mods):
             if attn_layer.attn_func == 6:
-                hidden_states = attn_layer(hidden_states, encoder_kv=encoder_kv, sample=sample)
+                hidden_states = attn_layer(hidden_states, encoder_key_value=encoder_key_value, sample=sample)
             else:
-                hidden_states = attn_layer(hidden_states, encoder_kv=None, sample=sample)
+                hidden_states = attn_layer(hidden_states, encoder_key_value=None, sample=sample)
             if attn_layer.attn.record_attn:
                 self.ws.append(attn_layer.attn.w)
         if not fp16_out:
@@ -1459,8 +1456,8 @@ def __init__(
         attn_order=0,
         blocks=None,
         spread=None,
-        x_cond=False,
-        y_cond=False,
+        audio_conditioning=False,
+        metadata_conditioning=False,
         encoder_dims=0,
         only_encode=False,
         merged_decoder=False,
@@ -1475,14 +1472,14 @@ def __init__(
         self.width = width
         self.depth = depth
 
-        # TODO  rename x to proper name
+        # TODO  rename hidden_states to proper name
         self.x_emb = nn.Embedding(bins, width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
         self.x_emb_dropout = nn.Dropout(emb_dropout)
-        # TODO rename y and y_cond to proper names
-        self.y_cond = y_cond
-        self.x_cond = x_cond
-        if not y_cond:
+        # TODO rename y and metadata_conditioning to proper names
+        self.metadata_conditioning = metadata_conditioning
+        self.audio_conditioning = audio_conditioning
+        if not metadata_conditioning:
             self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
 
         self.pos_emb = JukeboxPositionalEmbedding(
@@ -1531,28 +1528,28 @@ def __init__(
                 self.x_out.weight = self.x_emb.weight
             self.loss = torch.nn.CrossEntropyLoss()
 
-    def preprocess(self, x):
-        # Input: x is NHWC and uint8. Converted to NL and long
+    def preprocess(self, hidden_states):
+        # Input: hidden_states is NHWC and uint8. Converted to NL and long
         # Can include stuff like bitpacking, reordering here.
-        N = x.shape[0]
-        return x.view(N, -1).long()
+        N = hidden_states.shape[0]
+        return hidden_states.view(N, -1).long()
 
-    def postprocess(self, x, sample_tokens=None):
+    def postprocess(self, hidden_states, sample_tokens=None):
         # Convert back from NL and long to NHWC
-        N = x.shape[0]
-        assert (0 <= x).all() and (x < self.bins).all()
+        N = hidden_states.shape[0]
+        assert (0 <= hidden_states).all() and (hidden_states < self.bins).all()
         if sample_tokens is None or sample_tokens == self.input_dims:
-            return x.view(N, *self.input_shape)
+            return hidden_states.view(N, *self.input_shape)
         else:
-            return x.view(N, -1)
+            return hidden_states.view(N, -1)
 
-    # TODO RENAME x, x_cond and y_cond, x_prime, x_gen, x_t
+    # TODO RENAME hidden_states, audio_conditioning and metadata_conditioning, x_prime, x_gen, x_t
     def forward(
         self,
-        x,
-        x_cond=None,
-        y_cond=None,
-        encoder_kv=None,
+        hidden_states,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        encoder_key_value=None,
         fp16=False,
         loss_full=False,
         encode=False,
@@ -1562,75 +1559,75 @@ def forward(
     ):
         # Preprocess.
         with torch.no_grad():
-            x = self.preprocess(x)
+            hidden_states = self.preprocess(hidden_states)
 
-        N = x.shape[0]
-        if not self.x_cond:
-            x_cond = torch.zeros((N, 1, self.width), device=x.device, dtype=torch.float)
+        N = hidden_states.shape[0]
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros((N, 1, self.width), device=hidden_states.device, dtype=torch.float)
 
-        x_t = x  # Target
-        x = self.x_emb(x)  # X emb
-        x = roll(x, 1)  # Shift by 1, and fill in start token
-        if self.y_cond:
-            x[:, 0] = y_cond.view(N, self.width)
+        x_t = hidden_states  # Target
+        hidden_states = self.x_emb(hidden_states)  # hidden_states emb
+        hidden_states = roll(hidden_states, 1)  # Shift by 1, and fill in start token
+        if self.metadata_conditioning:
+            hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
         else:
-            x[:, 0] = self.start_token
+            hidden_states[:, 0] = self.start_token
 
-        x = self.x_emb_dropout(x) + self.pos_emb_dropout(self.pos_emb()) + x_cond  # Pos emb and dropout
+        hidden_states = self.x_emb_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning  # Pos emb and dropout
 
-        x = self.transformer(x, encoder_kv=encoder_kv, fp16=fp16)  # Transformer
+        hidden_states = self.transformer(hidden_states, encoder_key_value=encoder_key_value, fp16=fp16)  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
-            x = x + x_cond
+            hidden_states = hidden_states + audio_conditioning
 
-        acts = x
+        acts = hidden_states
         if self.only_encode:
-            return x
-        x = self.x_out(x)  # Predictions
+            return hidden_states
+        hidden_states = self.x_out(hidden_states)  # Predictions
 
         if get_sep_loss:
             assert self.prime_len is not None
-            x_prime = x[:, : self.prime_len].reshape(-1, self.bins)
-            x_gen = x[:, self.prime_len :].reshape(-1, self.bins)
+            x_prime = hidden_states[:, : self.prime_len].reshape(-1, self.bins)
+            x_gen = hidden_states[:, self.prime_len :].reshape(-1, self.bins)
 
             prime_loss = F.cross_entropy(x_prime, x_t[:, : self.prime_len].reshape(-1)) / np.log(2.0)
             gen_loss = F.cross_entropy(x_gen, x_t[:, self.prime_len :].reshape(-1)) / np.log(2.0)
 
             loss = (prime_loss, gen_loss)  # Note order! Prime is first
         else:
-            loss = F.cross_entropy(x.view(-1, self.bins), x_t.view(-1)) / np.log(2.0)  # Loss
+            loss = F.cross_entropy(hidden_states.view(-1, self.bins), x_t.view(-1)) / np.log(2.0)  # Loss
 
         if get_preds:
-            return loss, x
+            return loss, hidden_states
         elif get_acts:
             return loss, acts
         else:
             return loss, None
 
-    # TODO rename x, x_conds, y_conds
-    def get_emb(self, sample_t, n_samples, x, x_cond, y_cond):
+    # TODO rename hidden_states, x_conds, y_conds
+    def get_emb(self, sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning):
         N, D = n_samples, self.input_dims
         if sample_t == 0:
-            x = torch.empty(n_samples, 1, self.width).to(x_cond.device)
-            if self.y_cond:
-                x[:, 0] = y_cond.view(N, self.width)
+            hidden_states = torch.empty(n_samples, 1, self.width).to(audio_conditioning.device)
+            if self.metadata_conditioning:
+                hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
             else:
-                x[:, 0] = self.start_token
+                hidden_states[:, 0] = self.start_token
         else:
-            x = self.x_emb(x)
-        if x_cond.shape == (N, D, self.width):
-            cond = x_cond[:, sample_t : sample_t + 1, :]
+            hidden_states = self.x_emb(hidden_states)
+        if audio_conditioning.shape == (N, D, self.width):
+            cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
-            cond = x_cond
-        x = x + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
-        return x, cond
+            cond = audio_conditioning
+        hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
+        return hidden_states, cond
 
-    # TODO rename x, x_conds, y_conds
+    # TODO rename hidden_states, x_conds, y_conds
     def sample(
         self,
         n_samples,
-        x_cond=None,
-        y_cond=None,
-        encoder_kv=None,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        encoder_key_value=None,
         fp16=False,
         temp=1.0,
         top_k=0,
@@ -1642,8 +1639,8 @@ def sample(
             sample_tokens = self.input_dims
         N, _ = n_samples, self.input_dims
 
-        if not self.x_cond:
-            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(
                 "cpu" if torch.cuda.is_available() else "cpu"
             )
 
@@ -1653,10 +1650,10 @@ def sample(
                 preds = []
 
             for sample_t in get_range(range(0, sample_tokens)):
-                hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, x_cond, y_cond)
+                hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
-                    hidden_states, encoder_kv=encoder_kv, sample=True, fp16=fp16
+                    hidden_states, encoder_key_value=encoder_key_value, sample=True, fp16=fp16
                 )  # TODO put fp16 back # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -1666,7 +1663,7 @@ def sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                tokens = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace x
+                tokens = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace hidden_states
                 sampled_tokens.append(tokens.clone())
             del tokens
             self.transformer.del_cache()
@@ -1684,10 +1681,10 @@ def sample(
     def primed_sample(
         self,
         n_samples,
-        x,
-        x_cond=None,
-        y_cond=None,
-        encoder_kv=None,
+        hidden_states,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        encoder_key_value=None,
         fp16=False,
         temp=1.0,
         top_k=0,
@@ -1700,15 +1697,15 @@ def primed_sample(
             sample_tokens = self.input_dims
         # Preprocess.
         with torch.no_grad():
-            x = self.preprocess(x)
+            hidden_states = self.preprocess(hidden_states)
 
-        xs = torch.split(x, 1, dim=1)
-        xs = list(xs)
+        sampled_audio = torch.split(hidden_states, 1, dim=1)
+        sampled_audio = list(sampled_audio)
 
         N, _ = n_samples, self.input_dims
 
-        if not self.x_cond:
-            x_cond = torch.zeros((N, 1, self.width), dtype=torch.float).to(x.device)
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(hidden_states.device)
 
         with torch.no_grad():
             if get_preds:
@@ -1717,28 +1714,28 @@ def primed_sample(
             # Fill up key/value cache for past context by runing forward pass.
             # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
             if chunk_size is None:
-                chunk_size = len(xs)
-            # assert len(xs) % chunk_size == 0, f'expected {len(xs)} to be divisible by {chunk_size}'
-            chunk_sizes = split_chunks(len(xs), chunk_size)
+                chunk_size = len(sampled_audio)
+            # assert len(sampled_audio) % chunk_size == 0, f'expected {len(sampled_audio)} to be divisible by {chunk_size}'
+            chunk_sizes = split_chunks(len(sampled_audio), chunk_size)
             x_primes = []
             start = 0
-            x = None
+            hidden_states = None
 
             for current_chunk_size in get_range(chunk_sizes):
-                xs_prime, conds_prime = [], []
+                sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
-                    x_prime, cond_prime = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
-                    x = xs[sample_t]
-                    xs_prime.append(x_prime)
+                    x_prime, cond_prime = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
+                    hidden_states = sampled_audio[sample_t]
+                    sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
                 start = start + current_chunk_size
 
-                x_prime, cond_prime = torch.cat(xs_prime, dim=1), torch.cat(conds_prime, dim=1)
-                del xs_prime
+                x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1)
+                del sampled_audio_prime
                 del conds_prime
                 if not get_preds:
                     del cond_prime
-                x_prime = self.transformer(x_prime, encoder_kv=encoder_kv, sample=True, fp16=fp16)
+                x_prime = self.transformer(x_prime, encoder_key_value=encoder_key_value, sample=True, fp16=fp16)
 
                 if get_preds:
                     if self.add_cond_after_transformer:
@@ -1754,45 +1751,45 @@ def primed_sample(
                 preds.append(x_prime)
 
             empty_cache()
-            self.transformer.check_cache(n_samples, len(xs), fp16)
+            self.transformer.check_cache(n_samples, len(sampled_audio), fp16)
 
-            x = xs[-1]
+            hidden_states = sampled_audio[-1]
             empty_cache()
 
-            for sample_t in get_range(range(len(xs), sample_tokens)):
-                x, cond = self.get_emb(sample_t, n_samples, x, x_cond, y_cond)
+            for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
+                hidden_states, cond = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
-                x = self.transformer(x, encoder_kv=encoder_kv, sample=True, fp16=fp16)  # Transformer
+                hidden_states = self.transformer(hidden_states, encoder_key_value=encoder_key_value, sample=True, fp16=fp16)  # Transformer
                 if self.add_cond_after_transformer:
-                    x = x + cond
-                x = self.x_out(x)  # Predictions
+                    hidden_states = hidden_states + cond
+                hidden_states = self.x_out(hidden_states)  # Predictions
                 if get_preds:
-                    preds.append(x)
+                    preds.append(hidden_states)
                 # Adjust logits
-                x = x / temp
-                x = filter_logits(x, top_k=top_k, top_p=top_p)
-                x = torch.distributions.Categorical(logits=x).sample()  # Sample and replace x
-                assert x.shape == (n_samples, 1)
-                xs.append(x.clone())
+                hidden_states = hidden_states / temp
+                hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+                hidden_states = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace hidden_states
+                assert hidden_states.shape == (n_samples, 1)
+                sampled_audio.append(hidden_states.clone())
 
-            del x
+            del hidden_states
             self.transformer.del_cache()
 
-            x = torch.cat(xs, dim=1)
+            hidden_states = torch.cat(sampled_audio, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            x = self.postprocess(x, sample_tokens)
+            hidden_states = self.postprocess(hidden_states, sample_tokens)
         if get_preds:
-            return x, preds
+            return hidden_states, preds
         else:
-            return x
+            return hidden_states
 
 
 def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
     """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
     Args:
         logits: logits distribution shape (vocabulary size)
-        top_k >0: keep only top k tokens with highest probability (top-k filtering).
+        top_k >0: keep only top key tokens with highest probability (top-k filtering).
         top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
     """
     # assert logits.dim() == 2  # batch size 1 for now - could be updated for more but the code would be less clear
@@ -1828,8 +1825,8 @@ def get_normal(*shape, std=0.01):
     return w
 
 
-def roll(x, n):
-    return torch.cat((x[:, -n:], x[:, :-n]), dim=1)
+def roll(hidden_states, n):
+    return torch.cat((hidden_states[:, -n:], hidden_states[:, :-n]), dim=1)
 
 
 def split_chunks(length, chunk_size):
@@ -1885,7 +1882,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         if raw_audio_conditionning is None:
             raw_audio_conditionning = 0.0
-        # Embed x
+        # Embed hidden_states
         music_tokens = music_tokens.long()
         hidden_states = self.x_emb(music_tokens)
         hidden_states = hidden_states + raw_audio_conditionning
@@ -2004,8 +2001,8 @@ def __init__(
                 n_time, t_bins, relative_pos_range, out_width, init_scale, clamp=True
             )
 
-    def forward(self, y):
-        total_length, offset, length, artist, genre = y[:, 0:1], y[:, 1:2], y[:, 2:3], y[:, 3:4], y[:, 4:]
+    def forward(self, metadata):
+        total_length, offset, length, artist, genre = metadata[:, 0:1], metadata[:, 1:2], metadata[:, 2:3], metadata[:, 3:4], metadata[:, 4:]
         # Start embedding of length 1
         artist_emb = self.artist_emb(artist)
         # Empty genre slots are denoted by -1. We mask these out.
@@ -2031,7 +2028,7 @@ def forward(self, y):
 # TODO rename every conditioning
 class JukeboxPrior(nn.Module):
     """
-    Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
+    Model the prior on vquery codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
     on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
     we use the MusicTokenConditioner class To condition on lyrics, we allow two types of priors:
     - Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer
@@ -2039,18 +2036,18 @@ class JukeboxPrior(nn.Module):
     models the lyrics, and we use its last layer to produce keys/values that are attened to by the decoder transformer
     - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
       vocab
-    and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
+    and Vquery vocab into a single large vocab, and the lyric tokens and Vquery tokens into a single longer sequence of tokens
     which we autoregressively model together.
     """
 
     def __init__(self, config, level):
         super().__init__()
-        vqvae_z_shapes = config.vqvae_z_shapes
+        vqvae_music_tokens_shapes = config.vqvae_music_tokens_shapes
 
-        def rescale(z_shape):
-            return (z_shape[0] * config.n_ctx[-level - 1] // vqvae_z_shapes[level][0],)
+        def rescale(music_tokens_shape):
+            return (music_tokens_shape[0] * config.n_ctx[-level - 1] // vqvae_music_tokens_shapes[level][0],)
 
-        z_shapes = [rescale(z_shape) for z_shape in vqvae_z_shapes]
+        music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
         self.use_tokens = config.use_tokens[-level - 1]
         self.n_tokens = config.n_tokens[-level - 1]
         self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
@@ -2059,10 +2056,10 @@ def rescale(z_shape):
         if self.copy_input:
             config.bins = config.l_bins
 
-        self.z_shapes = z_shapes
-        self.levels = len(self.z_shapes)
+        self.music_tokens_shapes = music_tokens_shapes
+        self.levels = len(self.music_tokens_shapes)
 
-        self.z_shape = self.z_shapes[level]
+        self.music_tokens_shape = self.music_tokens_shapes[level]
 
         self.level = level
 
@@ -2073,7 +2070,7 @@ def rescale(z_shape):
             bins=config.l_bins,
             width=config.width[-level - 1],
             depth=config.depth[-level - 1],
-            heads=config.num_headss[-level - 1],
+            heads=config.n_heads[-level - 1], # TODO Rename in config
             attn_order=config.attn_order[-level - 1],
             blocks=config.blocks,
             spread=config.spread,
@@ -2109,7 +2106,7 @@ def rescale(z_shape):
         else:
             prime_kwargs = dict(bins=config.n_vocab)
 
-        x_cond_kwargs = dict(
+        audio_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
             init_scale=config.init_scale[-level - 1],
             width=config.cond_width[-level - 1],
@@ -2122,7 +2119,7 @@ def rescale(z_shape):
             checkpoint_res=config.cond_c_res[-level - 1],
         )  # have to keep this else names wrong
 
-        y_cond_kwargs = dict(
+        metadata_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
             init_scale=config.init_scale[-level - 1],
             y_bins=config.y_bins[-level - 1],
@@ -2134,33 +2131,33 @@ def rescale(z_shape):
         )
 
         # X conditioning
-        self.x_cond = level != (self.levels - 1)
+        self.audio_conditioning = level != (self.levels - 1)
         self.cond_level = level + 1
 
-        # Y conditioning
-        self.y_cond = config.labels
+        # metadata conditioning
+        self.metadata_conditioning = config.labels
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
         # X conditioning : conditioning on music tokens (either from audio or from previous levels or both)
-        if self.x_cond:
+        if self.audio_conditioning:
             self.conditioner_blocks = nn.ModuleList()
 
             def conditioner_block(_level):
                 return MusicTokenConditioner(
-                    input_shape=z_shapes[_level],
+                    input_shape=music_tokens_shapes[_level],
                     bins=config.l_bins,
                     down_t=config.downs_t[_level],
                     stride_t=config.strides_t[_level],
-                    **x_cond_kwargs,
+                    **audio_conditioning_kwargs,
                 )
 
             # if dist.get_rank() == 0: print(f"Conditioning on 1 above level(s)")
             self.conditioner_blocks.append(conditioner_block(self.cond_level))
 
-        # Y conditioning : contioning on timing, genres, and artist
-        if self.y_cond:
-            self.n_time = self.z_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
-            self.y_emb = LabelConditioner(n_time=self.n_time, include_time_signal=not self.x_cond, **y_cond_kwargs)
+        # metadata conditioning : contioning on timing, genres, and artist
+        if self.metadata_conditioning:
+            self.n_time = self.music_tokens_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
+            self.metadata_embedding = LabelConditioner(n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs)
 
         # Lyric conditioning
         if config.single_enc_dec[-level - 1]:
@@ -2182,8 +2179,8 @@ def conditioner_block(_level):
             self.prior = JukeboxConditionalAutoregressive(
                 input_shape=(sum(self.prior_dims),),
                 bins=sum(self.prior_bins),
-                x_cond=(self.x_cond or self.y_cond),
-                y_cond=True,
+                audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+                metadata_conditioning=True,
                 prime_len=self.prime_loss_dims,
                 **prior_kwargs,
             )
@@ -2195,7 +2192,7 @@ def conditioner_block(_level):
                 self.prime_loss_dims = np.prod(prime_input_shape)
                 self.prime_acts_width, self.prime_state_width = prime_kwargs["width"], prior_kwargs["width"]
                 self.prime_prior = JukeboxConditionalAutoregressive(
-                    input_shape=prime_input_shape, x_cond=False, y_cond=False, only_encode=True, **prime_kwargs
+                    input_shape=prime_input_shape, audio_conditioning=False, metadata_conditioning=False, only_encode=True, **prime_kwargs
                 )
                 self.prime_state_proj = JukeboxConv1D(self.prime_acts_width, self.prime_state_width)
                 self.prime_state_ln = JukeboxLayerNorm(self.prime_state_width)
@@ -2204,11 +2201,11 @@ def conditioner_block(_level):
                 nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs["init_scale"])
             else:
                 self.prime_loss_dims = 0
-            self.gen_loss_dims = np.prod(self.z_shape)
+            self.gen_loss_dims = np.prod(self.music_tokens_shape)
             self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
             self.prior = JukeboxConditionalAutoregressive(
-                x_cond=(self.x_cond or self.y_cond),
-                y_cond=self.y_cond,
+                audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+                metadata_conditioning=self.metadata_conditioning,
                 encoder_dims=self.prime_loss_dims,
                 merged_decoder=config.merged_decoder[-level - 1],
                 **prior_kwargs,
@@ -2225,93 +2222,93 @@ def conditioner_block(_level):
             f" length:{self.sample_length}"
         )
 
-    def get_y(self, labels, start, total_length, offset, get_indices=False):
-        y = labels.clone()
-        y[:, 0] = total_length
+    def get_metadata(self, labels, start, total_length, offset, get_indices=False):
+        metadata = labels.clone()
+        metadata[:, 0] = total_length
         # Set sample_length to match this level
-        y[:, 2] = int(self.sample_length)
+        metadata[:, 2] = int(self.sample_length)
 
         # Set offset
-        y[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
-        # here since y has the full token_list, ze just need to selected the ones that are relevant
+        metadata[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
+        # here since metadata has the full token_list, ze just need to selected the ones that are relevant
 
         # Set lyric tokens
-        y, indices = self.set_y_lyric_tokens(y)
+        metadata, indices = self.set_metadata_lyric_tokens(metadata)
         if get_indices:
-            return y, indices
+            return metadata, indices
         else:
-            return y
+            return metadata
 
-    def set_y_lyric_tokens(self, labels):
-        # assert ys.shape[0] == len(labels)
+    def set_metadata_lyric_tokens(self, labels):
+        # assert metadatas.shape[0] == len(labels)
         if self.n_tokens > 0:
             # total_length, offset, duration):
             tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
             indices_list = []  # whats the index of each current character in original array
             for i in range(labels.shape[0]):
-                full_tokens = labels.clone()[:, 4 + self.y_emb.max_bow_genre_size :]
+                full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_bow_genre_size :]
                 total_length, offset, duration = labels[i, 0], labels[i, 1], labels[i, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
                 tokens_list[i, :] = tokens
                 indices_list.append(indices)
 
-            return torch.cat((labels[:, : 4 + self.y_emb.max_bow_genre_size], tokens_list), dim=-1), indices_list
+            return torch.cat((labels[:, : 4 + self.metadata_embedding.max_bow_genre_size], tokens_list), dim=-1), indices_list
         else:
             return labels, None
 
-    def get_z_conds(self, music_tokens, start, end):
+    def get_music_tokens_conds(self, music_tokens, start, end):
         if self.level != self.levels - 1:
             assert start % self.cond_downsample == end % self.cond_downsample == 0
-            z_cond = music_tokens[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
-            assert z_cond.shape[1] == self.n_ctx // self.cond_downsample
-            z_conds = [z_cond]
+            music_tokens_cond = music_tokens[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
+            assert music_tokens_cond.shape[1] == self.n_ctx // self.cond_downsample
+            music_tokens_conds = [music_tokens_cond]
         else:
-            z_conds = None
-        return z_conds
+            music_tokens_conds = None
+        return music_tokens_conds
 
-    def prior_preprocess(self, xs, conds):
-        N = xs[0].shape[0]
-        for i in range(len(xs)):
-            xs[i] = (xs[i] + int(self.prior_bins_shift[i])).view(N, -1)
+    def prior_preprocess(self, sampled_audio, conds):
+        N = sampled_audio[0].shape[0]
+        for i in range(len(sampled_audio)):
+            sampled_audio[i] = (sampled_audio[i] + int(self.prior_bins_shift[i])).view(N, -1)
 
         for i in range(len(conds)):
             cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
             if cond is None:
-                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=xs[0].device)
+                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=sampled_audio[0].device)
 
-        return torch.cat(xs, dim=1), torch.cat(conds, dim=1)
+        return torch.cat(sampled_audio, dim=1), torch.cat(conds, dim=1)
 
-    def prior_postprocess(self, z):
-        N = z.shape[0]
-        dims = (self.prior_dims[0], z.shape[1] - self.prior_dims[0])
-        xs = list(torch.split(z, dims, dim=1))
+    def prior_postprocess(self, music_tokens):
+        N = music_tokens.shape[0]
+        dims = (self.prior_dims[0], music_tokens.shape[1] - self.prior_dims[0])
+        sampled_audio = list(torch.split(music_tokens, dims, dim=1))
 
-        for i in range(len(xs)):
+        for i in range(len(sampled_audio)):
             shape = self.prior_shapes[i]
             _, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])  # bins, -> _,
-            xs[i] = (xs[i] - bins_shift).view(N, -1, *shape[1:])
-            xs[i] = torch.clamp(
-                xs[i], min=0
+            sampled_audio[i] = (sampled_audio[i] - bins_shift).view(N, -1, *shape[1:])
+            sampled_audio[i] = torch.clamp(
+                sampled_audio[i], min=0
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
 
-        return xs[-1]
+        return sampled_audio[-1]
 
-    def x_emb(self, z_conds):
-        z_conds = z_conds[: self.cond_level - self.level]
-        x_cond = None
-        for z_cond, conditioner_block in reversed(list(zip(z_conds, self.conditioner_blocks))):
-            x_cond = conditioner_block(z_cond, x_cond)
-        return x_cond
+    def x_emb(self, music_tokens_conds):
+        music_tokens_conds = music_tokens_conds[: self.cond_level - self.level]
+        audio_conditioning = None
+        for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, self.conditioner_blocks))):
+            audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
+        return audio_conditioning
 
     # should be removed as the vq-vae is no longer part of the prior
-    def encode(self, x, start_level=None, end_level=None, bs_chunks=1):
+    def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
         if start_level is None:
             start_level = self.level
         if end_level is None:
             end_level = self.levels
         # Get latents
         with torch.no_grad():
-            music_tokens = self.encoder(x, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+            music_tokens = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
         return music_tokens
 
     # same as above, the va-vae is no longer part of the prior
@@ -2324,22 +2321,22 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
             x_out = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
         return x_out
 
-    def get_cond(self, z_conds, y):
-        if y is not None:
-            n_labels = y.shape[1] - self.n_tokens
-            y, prime = y[:, :n_labels], y[:, n_labels:]
+    def get_cond(self, music_tokens_conds, metadata):
+        if metadata is not None:
+            n_labels = metadata.shape[1] - self.n_tokens
+            metadata, prime = metadata[:, :n_labels], metadata[:, n_labels:]
         else:
-            y, prime = None, None
-        y_cond, y_pos = self.y_emb(y) if self.y_cond else (None, None)
-        x_cond = self.x_emb(z_conds) if self.x_cond else y_pos
-        return x_cond, y_cond, prime
+            metadata, prime = None, None
+        metadata_conditioning, metadata_pos = self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
+        audio_conditioning = self.x_emb(music_tokens_conds) if self.audio_conditioning else metadata_pos
+        return audio_conditioning, metadata_conditioning, prime
 
     def sample(
         self,
         n_samples,
-        z=None,
-        z_conds=None,
-        y=None,
+        music_tokens=None,
+        music_tokens_conds=None,
+        metadata=None,
         fp16=False,
         temp=1.0,
         top_k=0,
@@ -2347,25 +2344,25 @@ def sample(
         chunk_size=None,
         sample_tokens=None,
     ):
-        no_past_context = z is None or z.shape[1] == 0
+        no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
         print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
 
         with torch.no_grad():
-            # Currently x_cond only uses immediately above layer
-            x_cond, y_cond, prime = self.get_cond(z_conds, y)
+            # Currently audio_conditioning only uses immediately above layer
+            audio_conditioning, metadata_conditioning, prime = self.get_cond(music_tokens_conds, metadata)
             if self.single_enc_dec:
                 if no_past_context:
-                    z, x_cond = self.prior_preprocess([prime], [None, x_cond])
+                    music_tokens, audio_conditioning = self.prior_preprocess([prime], [None, audio_conditioning])
                 else:
-                    z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+                    music_tokens, audio_conditioning = self.prior_preprocess([prime, music_tokens], [None, audio_conditioning])
                 if sample_tokens is not None:
                     sample_tokens += self.n_tokens
-                z = self.prior.primed_sample(
+                music_tokens = self.prior.primed_sample(
                     n_samples,
-                    z,
-                    x_cond,
-                    y_cond,
+                    music_tokens,
+                    audio_conditioning,
+                    metadata_conditioning,
                     fp16=fp16,
                     temp=temp,
                     top_k=top_k,
@@ -2373,15 +2370,15 @@ def sample(
                     chunk_size=chunk_size,
                     sample_tokens=sample_tokens,
                 )
-                z = self.prior_postprocess(z)
+                music_tokens = self.prior_postprocess(music_tokens)
             else:
-                encoder_kv = self.get_encoder_kv(prime, fp16=fp16, sample=True)
+                encoder_key_value = self.get_encoder_key_value(prime, fp16=fp16, sample=True)
                 if no_past_context:
-                    z = self.prior.sample(
+                    music_tokens = self.prior.sample(
                         n_samples,
-                        x_cond,
-                        y_cond,
-                        encoder_kv,
+                        audio_conditioning,
+                        metadata_conditioning,
+                        encoder_key_value,
                         fp16=fp16,
                         temp=temp,
                         top_k=top_k,
@@ -2389,12 +2386,12 @@ def sample(
                         sample_tokens=sample_tokens,
                     )
                 else:
-                    z = self.prior.primed_sample(
+                    music_tokens = self.prior.primed_sample(
                         n_samples,
-                        z,
-                        x_cond,
-                        y_cond,
-                        encoder_kv,
+                        music_tokens,
+                        audio_conditioning,
+                        metadata_conditioning,
+                        encoder_key_value,
                         fp16=fp16,
                         temp=temp,
                         top_k=top_k,
@@ -2402,34 +2399,34 @@ def sample(
                         chunk_size=chunk_size,
                         sample_tokens=sample_tokens,
                     )
-        return z
+        return music_tokens
 
-    def get_encoder_kv(self, prime, fp16=False, sample=False):
+    def get_encoder_key_value(self, prime, fp16=False, sample=False):
         if self.n_tokens != 0 and self.use_tokens:
             if sample:
                 self.prime_prior = self.prime_prior.to(prime.device)
             prime_acts = self.prime_prior(prime, None, None, None, fp16=fp16)
-            encoder_kv = self.prime_state_ln(self.prime_state_proj(prime_acts))
+            encoder_key_value = self.prime_state_ln(self.prime_state_proj(prime_acts))
             if sample:
                 self.prime_prior.cpu()
                 if fp16:
-                    encoder_kv = encoder_kv.half()
+                    encoder_key_value = encoder_key_value.half()
         else:
-            encoder_kv = None
-        return encoder_kv
+            encoder_key_value = None
+        return encoder_key_value
 
-    def get_prime_loss(self, encoder_kv, prime_t):
+    def get_prime_loss(self, encoder_key_value, prime_t):
         if self.use_tokens:
-            encoder_kv = encoder_kv.float()
-            encoder_kv = self.prime_x_out(encoder_kv)
-            prime_loss = nn.functional.cross_entropy(encoder_kv.view(-1, self.prime_bins), prime_t.view(-1)) / np.log(
+            encoder_key_value = encoder_key_value.float()
+            encoder_key_value = self.prime_x_out(encoder_key_value)
+            prime_loss = nn.functional.cross_entropy(encoder_key_value.view(-1, self.prime_bins), prime_t.view(-1)) / np.log(
                 2.0
             )
         else:
             prime_loss = torch.tensor(0.0, device="cuda")
         return prime_loss
 
-    def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn_weights=False):
+    def music_tokens_forward(self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False):
         """
         Arguments:
             get_attn_weights (bool or set): Makes forward prop dump
@@ -2438,18 +2435,18 @@ def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn
         """
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
-        x_cond, y_cond, prime = self.get_cond(z_conds, y)
+        audio_conditioning, metadata_conditioning, prime = self.get_cond(music_tokens_conds, metadata)
         if self.copy_input:
-            prime = z[:, : self.n_tokens]
+            prime = music_tokens[:, : self.n_tokens]
         if self.single_enc_dec:
-            z, x_cond = self.prior_preprocess([prime, z], [None, x_cond])
+            music_tokens, audio_conditioning = self.prior_preprocess([prime, music_tokens], [None, audio_conditioning])
             (prime_loss, gen_loss), preds = self.prior(
-                z, x_cond, y_cond, fp16=fp16, get_sep_loss=True, get_preds=get_preds
+                music_tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
             )
         else:
-            encoder_kv = self.get_encoder_kv(prime, fp16=fp16)
-            prime_loss = self.get_prime_loss(encoder_kv, prime)
-            gen_loss, preds = self.prior(z, x_cond, y_cond, encoder_kv, fp16=fp16, get_preds=get_preds)
+            encoder_key_value = self.get_encoder_key_value(prime, fp16=fp16)
+            prime_loss = self.get_prime_loss(encoder_key_value, prime)
+            gen_loss, preds = self.prior(music_tokens, audio_conditioning, metadata_conditioning, encoder_key_value, fp16=fp16, get_preds=get_preds)
         loss = (self.prime_loss_fraction * prime_loss * self.prime_loss_dims / self.total_loss_dims) + (
             gen_loss * self.gen_loss_dims / self.total_loss_dims
         )
@@ -2465,15 +2462,15 @@ def z_forward(self, z, z_conds=[], y=None, fp16=False, get_preds=False, get_attn
         else:
             return loss, metrics
 
-    def forward(self, x, y=None, fp16=False, decode=False, get_preds=False):
-        bs = x.shape[0]
-        z, *z_conds = self.encode(x, bs_chunks=bs)
-        loss, metrics = self.z_forward(z=z, z_conds=z_conds, y=y, fp16=fp16, get_preds=get_preds)
+    def forward(self, hidden_states, metadata=None, fp16=False, decode=False, get_preds=False):
+        batch_size = hidden_states.shape[0]
+        music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
+        loss, metrics = self.music_tokens_forward(music_tokens=music_tokens, music_tokens_conds=music_tokens_conds, metadata= metadata, fp16=fp16, get_preds=get_preds)
         if decode:
-            x_out = self.decode([z, *z_conds])
+            dequantised_states = self.decode([music_tokens, *music_tokens_conds])
         else:
-            x_out = None
-        return x_out, loss, metrics
+            dequantised_states = None
+        return dequantised_states, loss, metrics
 
 
 class JukeboxPreTrainedModel(PreTrainedModel):
@@ -2545,11 +2542,11 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]
-    bs, total_length = tokens.shape[0], tokens.shape[1]
+    batch_size, total_length = tokens.shape[0], tokens.shape[1]
     if total_length < n_ctx:
         padding_length = n_ctx - total_length
         tokens = torch.cat(
-            [tokens, torch.zeros(bs, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
+            [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
         )
         total_length = tokens.shape[1]
     else:
@@ -2565,14 +2562,14 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     for start in get_starts(total_length, n_ctx, hop_length):
         end = start + n_ctx
 
-        # set y offset, sample_length and lyrics tokens
-        y, indices_hop = prior.get_y(labels, start, hps.sample_length, get_indices=True, offset=0)
+        # set metadata offset, sample_length and lyrics tokens
+        metadata, indices_hop = prior.get_metadata(labels, start, hps.sample_length, get_indices=True, offset=0)
 
-        tokens_bs = torch.chunk(tokens, bs, dim=0)
-        y_bs = torch.chunk(y, bs, dim=0)
+        tokens_bs = torch.chunk(tokens, batch_size, dim=0)
+        metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
-        for tokens_i, y_i in zip(tokens_bs, y_bs):
-            w_hop = prior.z_forward(tokens_i[:, start:end], [], y_i, fp16=fp16, get_attn_weights=attn_layers)
+        for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
+            w_hop = prior.music_tokens_forward(tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers)
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -2590,7 +2587,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     # Combine attn for each hop into attn for full range
     # Use indices to place them into correct place for corresponding source tokens
     alignments = []
-    for item in range(bs):
+    for item in range(batch_size):
         # Note each item has different length lyrics
         full_tokens = labels[:, 3:]
         alignment = np.zeros((total_length, len(full_tokens) + 1))
@@ -2654,7 +2651,7 @@ def __init__(self, config):
         super().__init__(config)
         self.embed_dim = config.hidden_size
         self.vqvae = JukeboxVQVAE(config)
-        config.vqvae_z_shapes = self.vqvae.z_shapes
+        config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
@@ -2678,7 +2675,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         n_samples = self.config.n_samples
         n_ctx = prior.n_ctx
         end = start + n_ctx
-        # get z already sampled at current level
+        # get music_tokens already sampled at current level
         previous_sampled_tokens = music_tokens[level][:, start:end]
 
         if "sample_tokens" in sampling_kwargs:
@@ -2703,31 +2700,31 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             # Nothing new to sample
             return music_tokens
 
-        # get z_conds from level above TODO rename to latent_cond? Or music_token_cond
-        z_conds = prior.get_z_conds(music_tokens, start, end)
+        # get music_tokens_conds from level above TODO rename to latent_cond? Or music_token_cond
+        music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
         # if there are no levels above should return None!
 
-        # set y offset, sample_length and lyrics okens TODO rename to lyric_cond
-        y = prior.get_y(labels, start, self.config.sample_length, offset)
+        # set metadata offset, sample_length and lyrics okens TODO rename to lyric_cond
+        metadata = prior.get_metadata(labels, start, self.config.sample_length, offset)
 
         empty_cache()
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
-        z_list = split_batch(previous_sampled_tokens, n_samples, max_batch_size)
-        z_conds_list = split_batch(z_conds, n_samples, max_batch_size)
-        y_list = split_batch(y, n_samples, max_batch_size)
-        z_samples = []
-        for z_i, z_conds_i, y_i in zip(z_list, z_conds_list, y_list):
-            z_samples_i = prior.sample(n_samples=z_i.shape[0], z=z_i, z_conds=z_conds_i, y=y_i, **sampling_kwargs)
-            z_samples.append(z_samples_i)
-        z = torch.cat(z_samples, dim=0)
+        music_tokens_list = split_batch(previous_sampled_tokens, n_samples, max_batch_size)
+        music_tokens_conds_list = split_batch(music_tokens_conds, n_samples, max_batch_size)
+        metadata_list = split_batch(metadata, n_samples, max_batch_size)
+        tokens = []
+        for music_tokens_i, music_tokens_conds_i, metadata_i in zip(music_tokens_list, music_tokens_conds_list, metadata_list):
+            tokens_i = prior.sample(n_samples=music_tokens_i.shape[0], music_tokens=music_tokens_i, music_tokens_conds=music_tokens_conds_i, metadata= metadata_i, **sampling_kwargs)
+            tokens.append(tokens_i)
+        sampled_tokens = torch.cat(tokens, dim=0)
 
         sampling_kwargs["max_batch_size"] = max_batch_size
 
-        # Update z with new sample
-        z_new = z[:, -new_tokens:]
-        music_tokens[level] = torch.cat([music_tokens[level], z_new], dim=1)
+        # Update music_tokens with new sample
+        music_tokens_new = sampled_tokens[:, -new_tokens:]
+        music_tokens[level] = torch.cat([music_tokens[level], music_tokens_new], dim=1)
         return music_tokens
 
     # Sample total_length tokens at level=level with hop_length=hop_length

From 55e83cba4be3cc0cf22a9b52719270ff9a42110d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 8 Aug 2022 16:11:25 +0000
Subject: [PATCH 081/196] finish weight renaming

---
 .../models/jukebox/modeling_jukebox.py        | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e87fd9573e057..6f38c72cf062c 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -64,9 +64,9 @@ def empty_cache():
     torch.cuda.empty_cache()
 
 
-def get_range(hidden_states):
+def get_range(list):
     return tqdm(
-        hidden_states, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+        list, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
     )
 
 
@@ -478,7 +478,7 @@ def forward(self, hidden_states, update_codebook=True):
         # Preprocess
         hidden_states, prenorm = self.preprocess(hidden_states)
 
-        # Init key if not inited
+        # Init codebook if not inited
         if update_codebook and not self.init:
             self.init_codebook(hidden_states)
 
@@ -1241,7 +1241,7 @@ def __init__(
             prime_len=prime_len,
         )
 
-        self.ln_0 = JukeboxLayerNorm(width)
+        self.layer_norm_0 = JukeboxLayerNorm(width)
         self.mlp = JukeboxMLP(
             width=width,
             n_state=int(m_mlp * width),
@@ -1250,7 +1250,7 @@ def __init__(
             zero_out=zero_out,
             init_scale=init_scale,
         )
-        self.ln_1 = JukeboxLayerNorm(width)
+        self.layer_norm_1 = JukeboxLayerNorm(width)
         self.res_scale = res_scale
 
         # TODO either support checkpointing for faster inference or get rid of this
@@ -1262,10 +1262,10 @@ def __init__(
 
     def forward(self, hidden_states, encoder_key_value, sample=False):
         residuals = hidden_states
-        hidden_states = self.ln_0(hidden_states)
+        hidden_states = self.layer_norm_0(hidden_states)
         hidden_states = self.attn(hidden_states, encoder_key_value, sample)
 
-        output_states = self.ln_1(residuals + hidden_states)
+        output_states = self.layer_norm_1(residuals + hidden_states)
         output_states = self.mlp(output_states)
         if self.res_scale == 1.0:
             output = residuals + hidden_states + output_states
@@ -1472,7 +1472,7 @@ def __init__(
         self.width = width
         self.depth = depth
 
-        # TODO  rename hidden_states to proper name
+        # TODO  rename x_emb to proper name, as well as x_out 
         self.x_emb = nn.Embedding(bins, width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
         self.x_emb_dropout = nn.Dropout(emb_dropout)
@@ -1858,11 +1858,11 @@ def __init__(
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
 
         # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
-        self.cond = DecoderConvBock(
+        self.upsampler = DecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
-        # TODO rename all ln to layer_norm
-        self.ln = JukeboxLayerNorm(self.width)
+        # TODO rename all layer_norm to layer_norm
+        self.layer_norm = JukeboxLayerNorm(self.width)
 
     def preprocess(self, hidden_states):
         hidden_states = hidden_states.permute(0, 2, 1)  # NTC -> NCT
@@ -1889,9 +1889,9 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
 
         # Run conditioner
         hidden_states = self.preprocess(hidden_states)
-        hidden_states = self.cond(hidden_states)
+        hidden_states = self.upsampler(hidden_states)
         hidden_states = self.postprocess(hidden_states)
-        hidden_states = self.ln(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
 
@@ -1964,12 +1964,12 @@ def forward(self, pos_start, pos_end=None):
         return self.emb(bins)
 
 
-# TODO rename y_bins and t_bins as well as y
+# TODO rename y_bins and timing_dims as well as y
 class LabelConditioner(nn.Module):
     def __init__(
         self,
-        y_bins,
-        t_bins,
+        metadata_dims,
+        timing_dims,
         sr,
         min_duration,
         max_duration,
@@ -1982,7 +1982,7 @@ def __init__(
         super().__init__()
         self.n_time = n_time
         self.out_width = out_width
-        bow_genre_bins, artist_bins = y_bins
+        bow_genre_bins, artist_bins = metadata_dims
         self.max_bow_genre_size = max_bow_genre_size
         self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
         self.artist_emb = SimpleEmbedding(artist_bins, out_width, init_scale)
@@ -1995,10 +1995,10 @@ def __init__(
             )  # Relative pos
             assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
             total_length_range, absolute_pos_range, relative_pos_range = t_ranges
-            self.total_length_emb = RangeEmbedding(1, t_bins, total_length_range, out_width, init_scale)
-            self.absolute_pos_emb = RangeEmbedding(n_time, t_bins, absolute_pos_range, out_width, init_scale)
+            self.total_length_emb = RangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
+            self.absolute_pos_emb = RangeEmbedding(n_time, timing_dims, absolute_pos_range, out_width, init_scale)
             self.relative_pos_emb = RangeEmbedding(
-                n_time, t_bins, relative_pos_range, out_width, init_scale, clamp=True
+                n_time, timing_dims, relative_pos_range, out_width, init_scale, clamp=True
             )
 
     def forward(self, metadata):
@@ -2122,8 +2122,8 @@ def rescale(music_tokens_shape):
         metadata_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
             init_scale=config.init_scale[-level - 1],
-            y_bins=config.y_bins[-level - 1],
-            t_bins=config.t_bins,
+            metadata_dims=config.y_bins[-level - 1], # rename to metadata_bins
+            timing_dims=config.t_bins, # rename to timing_dims
             sr=config.sr,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
@@ -2195,7 +2195,7 @@ def conditioner_block(_level):
                     input_shape=prime_input_shape, audio_conditioning=False, metadata_conditioning=False, only_encode=True, **prime_kwargs
                 )
                 self.prime_state_proj = JukeboxConv1D(self.prime_acts_width, self.prime_state_width)
-                self.prime_state_ln = JukeboxLayerNorm(self.prime_state_width)
+                self.prime_state_layer_norm = JukeboxLayerNorm(self.prime_state_width)
                 self.prime_bins = prime_kwargs["bins"]
                 self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)
                 nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs["init_scale"])
@@ -2406,7 +2406,7 @@ def get_encoder_key_value(self, prime, fp16=False, sample=False):
             if sample:
                 self.prime_prior = self.prime_prior.to(prime.device)
             prime_acts = self.prime_prior(prime, None, None, None, fp16=fp16)
-            encoder_key_value = self.prime_state_ln(self.prime_state_proj(prime_acts))
+            encoder_key_value = self.prime_state_layer_norm(self.prime_state_proj(prime_acts))
             if sample:
                 self.prime_prior.cpu()
                 if fp16:

From 632258d8c83c3a2ef50c89b9c715ae12ecab6afc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 9 Aug 2022 09:25:03 +0000
Subject: [PATCH 082/196] isolated remaining variables to rename

---
 .../models/jukebox/modeling_jukebox.py        | 270 +++++++++---------
 1 file changed, 137 insertions(+), 133 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6f38c72cf062c..90748e19465ea 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -310,19 +310,19 @@ def level_block(level, down_t, stride_t):
 
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
-    def forward(self, sampled_audio, all_levels=True):
-        hidden_states = sampled_audio[-1]
+    def forward(self, hidden_states, all_levels=True):
+        hidden_state = hidden_states[-1]
 
         # 32, 64 ...
         for level in reversed(range(self.levels)):
             level_block = self.level_blocks[level]
-            hidden_states = level_block(hidden_states)
+            hidden_state = level_block(hidden_state)
 
             if level != 0 and all_levels:
-                hidden_states = hidden_states + sampled_audio[level - 1]
+                hidden_state = hidden_state + hidden_states[level - 1]
 
-        hidden_states = self.out(hidden_states)
-        return hidden_states
+        hidden_state = self.out(hidden_state)
+        return hidden_state
 
 
 def dont_update(params):
@@ -368,7 +368,6 @@ def init_codebook(self, hidden_states):
         self.init = True
         # init k_w using random vectors from hidden_states codebook_w (index w?)
         codes = self._tile(hidden_states)
-        # _k_rand = codes[torch.randperm(codes.shape[0])][:codebook_dim]
         self.codebook = codes[torch.randperm(codes.shape[0])][:codebook_dim]
         self.codebook_sum = self.codebook
         self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
@@ -409,7 +408,7 @@ def update_codebook(self, hidden_states, latent_states):
     def preprocess(self, hidden_states):
         # NCT -> NTC -> [NT, C]
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N * L, w), k_j = (w, codebook_dim)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N *L, w), k_j = (w, codebook_dim)
 
         if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
@@ -438,7 +437,7 @@ def quantise(self, latent_states):
             torch.sum(latent_states**2, dim=-1, keepdim=True)
             - 2 * torch.matmul(latent_states, codebook_weights)
             + torch.sum(codebook_weights**2, dim=0, keepdim=True)
-        )  # (N * L, b)
+        )  # (N *L, b)
         min_distance, music_tokens = torch.min(distance, dim=-1)
         fit = torch.mean(min_distance)
         return music_tokens, fit
@@ -448,7 +447,7 @@ def dequantise(self, music_tokens):
         return dequantised_states
 
     def encode(self, latent_states):
-        samples, _, seq_length = latent_states.shape
+        samples, _, seq_len = latent_states.shape
 
         # Preprocess.
         latent_states, _ = self.preprocess(latent_states)
@@ -457,23 +456,23 @@ def encode(self, latent_states):
         music_tokens, _ = self.quantise(latent_states)
 
         # Postprocess.
-        music_tokens = music_tokens.view(samples, seq_length)
+        music_tokens = music_tokens.view(samples, seq_len)
         return music_tokens
 
     def decode(self, music_tokens):
-        samples, seq_length = music_tokens.shape
+        samples, seq_len = music_tokens.shape
 
         # Dequantise
         dequantised_states = self.dequantise(music_tokens)
 
         # Postprocess
         dequantised_states = (
-            dequantised_states.view(samples, seq_length, self.codebook_width).permute(0, 2, 1).contiguous()
+            dequantised_states.view(samples, seq_len, self.codebook_width).permute(0, 2, 1).contiguous()
         )
         return dequantised_states
 
     def forward(self, hidden_states, update_codebook=True):
-        samples, width, seq_length = hidden_states.shape
+        samples, width, seq_len = hidden_states.shape
 
         # Preprocess
         hidden_states, prenorm = self.preprocess(hidden_states)
@@ -499,7 +498,7 @@ def forward(self, hidden_states, update_codebook=True):
         dequantised_states = hidden_states + (dequantised_states - hidden_states).detach()
 
         # Postprocess
-        music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_length))
+        music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_len))
         return music_tokens, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
 
 
@@ -572,8 +571,6 @@ def __init__(self, config):
         self.strides_t = strides_t = config.vq_vae_strides_t
         self.codebook_dim = codebook_dim = config.vq_vae_codebook_dimension
         self.commit = config.vq_vae_commit
-        self.spectral = config.spectral
-        self.multispectral = config.multispectral
 
         self.sample_length = input_shape[0]
         x_shape, x_channels = input_shape[:-1], input_shape[-1]
@@ -750,28 +747,28 @@ def repeat(hidden_states, n_repeat, dim):
     )
 
 
-def get_mask(mask, q_l, kv_l, blocks, spread, device, sample, sample_t):
-    # returns a mask of shape 1 hidden_states 1 hidden_states q_l hidden_states kv_l or None if masking is not needed.
-    if mask is None or q_l == 1:
+def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
+    # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
+    if mask is None or query_length == 1:
         return None
-    offset = sample_t - q_l if sample else max(kv_l - q_l, 0)
+    offset = sample_t - query_length if sample else max(key_value_length - query_length, 0)
     if mask == "autoregressive":
         # Masked dense
-        mask = torch.ones(q_l, kv_l, device=device).tril(offset)
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
     elif mask == "summary":
         # Masked summary
         mask = (
             torch.nn.functional.pad(
-                torch.ones(q_l, q_l, device=device).tril().view(q_l, blocks, q_l // blocks)[:, :-1, -kv_l // blocks :],
+                torch.ones(query_length, query_length, device=device).tril().view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
                 (0, 0, 1, 0),
                 value=1,
             )
             .contiguous()
-            .view(q_l, kv_l)
+            .view(query_length, key_value_length)
         )
     elif mask == "prime":
-        mask = torch.ones(q_l, kv_l, device=device).tril(offset)
-    return mask.view(1, 1, q_l, kv_l)
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+    return mask.view(1, 1, query_length, key_value_length)
 
 
 class JukeboxAttention(nn.Module):
@@ -811,7 +808,7 @@ def __init__(
         self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
         self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
 
-        # Sequence of length l is factored as [blocks, l // blocks]
+        # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
         self.qkv, self.attn, self.attn_mask = {
             0: (self.factored_qkv, self.dense_attn, "autoregressive"),  # Attend to all positions
@@ -898,74 +895,73 @@ def dense_attn(self, query, key, value, sample):
         context_states = self.merge_heads(context_states)
         return context_states
 
-    # TODO rename here too
     def block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
-        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        batch_size, seq_length, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            assert seq_length == self._suff_cache_len(), f"{seq_length} != {self._suff_cache_len()}"
+            assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             query_length = query.shape[1]
             query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
-            if query_length < seq_length:
-                seq_length = query_length
-                key = key[:, -seq_length:].contiguous()
-                value = value[:, -seq_length:].contiguous()
-            key = key.view(batch_size * seq_length // block_ctx, block_ctx, embed_dim)
-            value = value.view(batch_size * seq_length // block_ctx, block_ctx, embed_dim)
-            return self.dense_attn(query, key, value, sample).view(batch_size, seq_length, embed_dim)
+            if query_length < seq_len:
+                seq_len = query_length
+                key = key[:, -seq_len:].contiguous()
+                value = value[:, -seq_len:].contiguous()
+            key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def transpose_block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
-        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            block_l = (l - 1) % block_ctx
+            block_l = (seq_len - 1) % block_ctx
             key = key[:, block_l::block_ctx, :]
             value = value[:, block_l::block_ctx, :]
-            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
-            ql = query.shape[1]
+            query_length = query.shape[1]
             query = (
-                query.view(batch_size, ql // block_ctx, block_ctx, d)
+                query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim)
                 .transpose(1, 2)
                 .contiguous()
-                .view(batch_size * block_ctx, ql // block_ctx, d)
+                .view(batch_size * block_ctx, query_length // block_ctx, embed_dim)
             )
             key = (
-                key.view(batch_size, l // block_ctx, block_ctx, d)
+                key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
                 .transpose(1, 2)
                 .contiguous()
-                .view(batch_size * block_ctx, l // block_ctx, d)
+                .view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
             )
             value = (
-                value.view(batch_size, l // block_ctx, block_ctx, d)
+                value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
                 .transpose(1, 2)
                 .contiguous()
-                .view(batch_size * block_ctx, l // block_ctx, d)
+                .view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
             )
             return (
                 self.dense_attn(query, key, value, sample)
-                .view(batch_size, block_ctx, ql // block_ctx, d)
+                .view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
                 .transpose(1, 2)
                 .contiguous()
-                .view(batch_size, ql, d)
+                .view(batch_size, query_length, embed_dim)
             )
 
     def prev_block_attn(self, query, key, value, sample):
         _, block_ctx = (
             self.blocks,
             self.block_ctx,
-        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            assert l == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
+            assert seq_len == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
             block = (l - 1) // block_ctx
             prev_l = (block - 1) * block_ctx
             if block > 0:
@@ -975,67 +971,67 @@ def prev_block_attn(self, query, key, value, sample):
             else:
                 key = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
                 value = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
-            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
-            ql = query.shape[1]
-            query = query.view(batch_size * ql // block_ctx, block_ctx, d)
+            query_length = query.shape[1]
+            query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
             key = torch.nn.functional.pad(
-                key.view(batch_size, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(batch_size * l // block_ctx, block_ctx, d)
+                key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
             value = torch.nn.functional.pad(
-                value.view(batch_size, l // block_ctx, block_ctx, d)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(batch_size * l // block_ctx, block_ctx, d)
-            if ql < l:
-                qb = ql // block_ctx
-                kb = l // block_ctx
-                l = ql
-                key = key.view(batch_size, kb, block_ctx, d)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, d)
-                value = value.view(batch_size, kb, block_ctx, d)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, d)
-            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
+                value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
+            ).view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            if query_length < seq_len:
+                qb = query_length // block_ctx
+                kb = seq_len // block_ctx
+                seq_len = query_length
+                key = key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, embed_dim)
+                value = value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, embed_dim)
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def summary_attn(self, query, key, value, sample):
         blocks, block_ctx = (
             self.blocks,
             self.block_ctx,
-        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             key = torch.nn.functional.pad(key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
             value = torch.nn.functional.pad(value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
-            return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             key = torch.nn.functional.pad(
-                key.view(batch_size, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # batch_size, blocks, d
+                key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # batch_size, blocks, embed_dim
             value = torch.nn.functional.pad(
-                value.view(batch_size, blocks, l // blocks, d)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # batch_size, blocks, d
-            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
+                value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :], (0, 0, 1, 0)
+            )  # batch_size, blocks, embed_dim
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def summary_spread_attn(self, query, key, value, sample):
         blocks, _, spread = (
             self.blocks,
             self.block_ctx,
             self.spread,
-        )  # block_ctx is l // blocks for complete l ie l = n_ctx. Sampling has less l
-        batch_size, l, d = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             assert False, "Not yet implemented"
-            # key = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, d)
-            # value = torch.nn.functional.pad(value,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, d)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, d)
-            # return self.dense_attn(query, key, value, sample).view(batch_size, 1, d)
+            # key = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, embed_dim)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, embed_dim)
+            # value = torch.nn.functional.pad(value,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, embed_dim)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, embed_dim)
+            # return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             key = (
-                torch.nn.functional.pad(key.view(batch_size, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                torch.nn.functional.pad(key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
                 .contiguous()
-                .view(batch_size, blocks * spread, d)
-            )  # batch_size, blocks * spread, d
+                .view(batch_size, blocks * spread, embed_dim)
+            )  # batch_size, blocks * spread, embed_dim
             value = (
-                torch.nn.functional.pad(value.view(batch_size, blocks, l // blocks, d)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                torch.nn.functional.pad(value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
                 .contiguous()
-                .view(batch_size, blocks * spread, d)
-            )  # batch_size, blocks * spread, d
-            return self.dense_attn(query, key, value, sample).view(batch_size, l, d)
+                .view(batch_size, blocks * spread, embed_dim)
+            )  # batch_size, blocks * spread, embed_dim
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def prime_attn(self, query, key, value, sample):
         prime_len = self._prime_len
@@ -1118,10 +1114,10 @@ def _offset(self, curr_ctx):
         return (self.sample_t - curr_ctx) % self.block_ctx
 
     def _pad_to_block_ctx(self, hidden_states, query=False):
-        l = hidden_states.shape[1]
+        seq_len = hidden_states.shape[1]
         offset = self._offset(l) if query else 0
         n_blocks = (l + offset + self.block_ctx - 1) // self.block_ctx
-        pad = n_blocks * self.block_ctx - l - offset
+        pad = n_blocks * self.block_ctx - seq_len - offset
         if pad == 0 and offset == 0:
             return hidden_states
         else:
@@ -1379,16 +1375,16 @@ def _should_record_attn(layer_idx):
                 return record_attn
             return layer_idx in record_attn
 
-        for i, l in enumerate(self._attn_mods):
-            l.attn.record_attn = _should_record_attn(i)
+        for i, layer in enumerate(self._attn_mods):
+            layer.attn.record_attn = _should_record_attn(i)
         if record_attn:
             assert self.ws == []
-            for l in self._attn_mods:
-                assert l.attn.w is None
+            for layer in self._attn_mods:
+                assert layer.attn.w is None
         else:
             self.ws = []
-            for l in self._attn_mods:
-                l.attn.w = None
+            for layer in self._attn_mods:
+                layer.attn.w = None
 
     def forward(self, hidden_states, encoder_key_value=None, sample=False, fp16=False, fp16_out=False):
         if fp16:
@@ -1467,7 +1463,7 @@ def __init__(
         self.input_shape = input_shape
         self.input_dims = input_dims = np.prod(input_shape)
         self.encoder_dims = encoder_dims
-        # TODO rename bins
+        # TODO rename self.bins
         self.bins = bins
         self.width = width
         self.depth = depth
@@ -1476,7 +1472,6 @@ def __init__(
         self.x_emb = nn.Embedding(bins, width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
         self.x_emb_dropout = nn.Dropout(emb_dropout)
-        # TODO rename y and metadata_conditioning to proper names
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
@@ -1522,6 +1517,7 @@ def __init__(
             self.add_cond_after_transformer = True
             self.share_x_emb_x_out = True
 
+        # TODO rename x_out to proj_out + x_embed
         if not only_encode:
             self.x_out = nn.Linear(width, bins, bias=False)
             if self.share_x_emb_x_out:
@@ -1543,7 +1539,7 @@ def postprocess(self, hidden_states, sample_tokens=None):
         else:
             return hidden_states.view(N, -1)
 
-    # TODO RENAME hidden_states, audio_conditioning and metadata_conditioning, x_prime, x_gen, x_t
+    # TODO RENAME x_prime, x_gen, target, x_emb
     def forward(
         self,
         hidden_states,
@@ -1565,7 +1561,7 @@ def forward(
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros((N, 1, self.width), device=hidden_states.device, dtype=torch.float)
 
-        x_t = hidden_states  # Target
+        target = hidden_states  # Target
         hidden_states = self.x_emb(hidden_states)  # hidden_states emb
         hidden_states = roll(hidden_states, 1)  # Shift by 1, and fill in start token
         if self.metadata_conditioning:
@@ -1585,16 +1581,16 @@ def forward(
         hidden_states = self.x_out(hidden_states)  # Predictions
 
         if get_sep_loss:
-            assert self.prime_len is not None
+            # TODO rename x_prime and x_gen. Prime is related to primed sampling
             x_prime = hidden_states[:, : self.prime_len].reshape(-1, self.bins)
             x_gen = hidden_states[:, self.prime_len :].reshape(-1, self.bins)
 
-            prime_loss = F.cross_entropy(x_prime, x_t[:, : self.prime_len].reshape(-1)) / np.log(2.0)
-            gen_loss = F.cross_entropy(x_gen, x_t[:, self.prime_len :].reshape(-1)) / np.log(2.0)
+            prime_loss = F.cross_entropy(x_prime, target[:, : self.prime_len].reshape(-1)) / np.log(2.0)
+            gen_loss = F.cross_entropy(x_gen, target[:, self.prime_len :].reshape(-1)) / np.log(2.0)
 
             loss = (prime_loss, gen_loss)  # Note order! Prime is first
         else:
-            loss = F.cross_entropy(hidden_states.view(-1, self.bins), x_t.view(-1)) / np.log(2.0)  # Loss
+            loss = F.cross_entropy(hidden_states.view(-1, self.bins), target.view(-1)) / np.log(2.0)  # Loss
 
         if get_preds:
             return loss, hidden_states
@@ -1603,7 +1599,6 @@ def forward(
         else:
             return loss, None
 
-    # TODO rename hidden_states, x_conds, y_conds
     def get_emb(self, sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning):
         N, D = n_samples, self.input_dims
         if sample_t == 0:
@@ -1621,7 +1616,6 @@ def get_emb(self, sample_t, n_samples, hidden_states, audio_conditioning, metada
         hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
         return hidden_states, cond
 
-    # TODO rename hidden_states, x_conds, y_conds
     def sample(
         self,
         n_samples,
@@ -1677,7 +1671,6 @@ def sample(
         else:
             return tokens
 
-    # TODO rename all
     def primed_sample(
         self,
         n_samples,
@@ -1724,6 +1717,7 @@ def primed_sample(
             for current_chunk_size in get_range(chunk_sizes):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
+                    # TODO rename x_prime, con_prime
                     x_prime, cond_prime = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
                     hidden_states = sampled_audio[sample_t]
                     sampled_audio_prime.append(x_prime)
@@ -1852,7 +1846,7 @@ def __init__(
         super().__init__()
         self.x_shape = input_shape
 
-        # Embedding
+        # TODO rename x_emb
         self.width = out_width
         self.x_emb = nn.Embedding(bins, out_width)
         nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
@@ -1926,6 +1920,7 @@ class RangeEmbedding(nn.Module):
     def __init__(self, n_time, bins, range, out_width, init_scale, clamp=False):
         super().__init__()
         self.n_time = n_time
+        # TODO rename bins
         self.bins = bins
         self.emb = nn.Embedding(bins, out_width)
         nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
@@ -1982,6 +1977,7 @@ def __init__(
         super().__init__()
         self.n_time = n_time
         self.out_width = out_width
+        # TODO rename bins
         bow_genre_bins, artist_bins = metadata_dims
         self.max_bow_genre_size = max_bow_genre_size
         self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
@@ -2025,7 +2021,7 @@ def forward(self, metadata):
         return start_emb, pos_emb
 
 
-# TODO rename every conditioning
+# TODO rename l_bins, bins, prior_bins, prime_x_out
 class JukeboxPrior(nn.Module):
     """
     Model the prior on vquery codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
@@ -2063,6 +2059,7 @@ def rescale(music_tokens_shape):
 
         self.level = level
 
+        # TODO rename l_bins which is the lyrics tokens
         self.l_bins = config.l_bins
 
         prior_kwargs = dict(
@@ -2130,7 +2127,7 @@ def rescale(music_tokens_shape):
             max_bow_genre_size=config.max_bow_genre_size,
         )
 
-        # X conditioning
+        # Audio conditioning
         self.audio_conditioning = level != (self.levels - 1)
         self.cond_level = level + 1
 
@@ -2138,7 +2135,7 @@ def rescale(music_tokens_shape):
         self.metadata_conditioning = config.labels
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
-        # X conditioning : conditioning on music tokens (either from audio or from previous levels or both)
+        # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
         if self.audio_conditioning:
             self.conditioner_blocks = nn.ModuleList()
 
@@ -2163,6 +2160,7 @@ def conditioner_block(_level):
         if config.single_enc_dec[-level - 1]:
             # Single encoder-decoder transformer
             self.prior_shapes = [(self.n_tokens,), prior_kwargs.pop("input_shape")]
+            # TODO rename bins
             self.prior_bins = [prime_kwargs["bins"], prior_kwargs.pop("bins")]
             self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
             self.prior_bins_shift = np.cumsum([0, *self.prior_bins])[:-1]
@@ -2188,6 +2186,7 @@ def conditioner_block(_level):
         else:
             # Separate encoder-decoder transformer
             if self.n_tokens != 0 and self.use_tokens:
+                # TODO rename prime
                 prime_input_shape = (self.n_tokens,)
                 self.prime_loss_dims = np.prod(prime_input_shape)
                 self.prime_acts_width, self.prime_state_width = prime_kwargs["width"], prior_kwargs["width"]
@@ -2245,11 +2244,11 @@ def set_metadata_lyric_tokens(self, labels):
             # total_length, offset, duration):
             tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
             indices_list = []  # whats the index of each current character in original array
-            for i in range(labels.shape[0]):
+            for idx in range(labels.shape[0]):
                 full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_bow_genre_size :]
-                total_length, offset, duration = labels[i, 0], labels[i, 1], labels[i, 2]
+                total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2]
                 tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
-                tokens_list[i, :] = tokens
+                tokens_list[idx, :] = tokens
                 indices_list.append(indices)
 
             return torch.cat((labels[:, : 4 + self.metadata_embedding.max_bow_genre_size], tokens_list), dim=-1), indices_list
@@ -2292,7 +2291,8 @@ def prior_postprocess(self, music_tokens):
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
 
         return sampled_audio[-1]
-
+    
+    # TODO Rename x_emb
     def x_emb(self, music_tokens_conds):
         music_tokens_conds = music_tokens_conds[: self.cond_level - self.level]
         audio_conditioning = None
@@ -2301,25 +2301,25 @@ def x_emb(self, music_tokens_conds):
         return audio_conditioning
 
     # should be removed as the vq-vae is no longer part of the prior
-    def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
-        if start_level is None:
-            start_level = self.level
-        if end_level is None:
-            end_level = self.levels
-        # Get latents
-        with torch.no_grad():
-            music_tokens = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
-        return music_tokens
+    # def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
+    #     if start_level is None:
+    #         start_level = self.level
+    #     if end_level is None:
+    #         end_level = self.levels
+    #     # Get latents
+    #     with torch.no_grad():
+    #         music_tokens = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+    #     return music_tokens
 
     # same as above, the va-vae is no longer part of the prior
-    def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
-        if start_level is None:
-            start_level = self.level
-        if end_level is None:
-            end_level = self.levels
-        with torch.no_grad():
-            x_out = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
-        return x_out
+    # def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
+    #     if start_level is None:
+    #         start_level = self.level
+    #     if end_level is None:
+    #         end_level = self.levels
+    #     with torch.no_grad():
+    #         x_out = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+    #     return x_out
 
     def get_cond(self, music_tokens_conds, metadata):
         if metadata is not None:
@@ -2350,9 +2350,11 @@ def sample(
 
         with torch.no_grad():
             # Currently audio_conditioning only uses immediately above layer
+            # TODO Rename prime
             audio_conditioning, metadata_conditioning, prime = self.get_cond(music_tokens_conds, metadata)
             if self.single_enc_dec:
                 if no_past_context:
+                    # TODO Rename prime
                     music_tokens, audio_conditioning = self.prior_preprocess([prime], [None, audio_conditioning])
                 else:
                     music_tokens, audio_conditioning = self.prior_preprocess([prime, music_tokens], [None, audio_conditioning])
@@ -2402,6 +2404,7 @@ def sample(
         return music_tokens
 
     def get_encoder_key_value(self, prime, fp16=False, sample=False):
+        # TODO Rename prime
         if self.n_tokens != 0 and self.use_tokens:
             if sample:
                 self.prime_prior = self.prime_prior.to(prime.device)
@@ -2456,6 +2459,7 @@ def music_tokens_forward(self, music_tokens, music_tokens_conds=[], metadata=Non
         if get_preds:
             metrics["preds"] = preds.clone().detach()
         if get_attn_weights:
+             # TODO Rename ws to something more meaningful
             ws = self.prior.transformer.ws
             self.prior.transformer.set_record_attn(False)
             return ws
@@ -2537,7 +2541,7 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-# TODO fix this, consumes too much RAM
+# TODO fix this, consumes too much RAM so should probably be removed
 def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     level = level - 1  # Top level used
     n_ctx = prior.n_ctx

From 4f1656906c6ed53c284fd56ba9f4cbd0f8410972 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 9 Aug 2022 18:21:54 +0000
Subject: [PATCH 083/196] MAJOR UPDATE FULL RENAMING

---
 .../models/jukebox/configuration_jukebox.py   |  12 +-
 .../models/jukebox/convert_jukebox.py         | 229 +++++---
 .../models/jukebox/modeling_jukebox.py        | 536 ++++++++++--------
 3 files changed, 457 insertions(+), 320 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 26ba8b06dbaeb..bab46ce84dc7e 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -176,14 +176,14 @@ def __init__(
         cond_zero_out=False,
         # args for the priors, 3 priors
         n_ctx=(8192, 8192, 8192),
-        t_bins=128,
+        t_bins=128, # TODO rename to timing_embed_dim
         downs_t=(3, 2, 2),
         strides_t=(2, 2, 2),
         single_enc_dec=[True, False, False],
         labels=False,
         merged_decoder=[True, False, False],
         priors_width=[4096, 2048, 1024],
-        l_bins=256,
+        latent_dim=2048,
         width=[4800, 1920, 128],
         depth=[79, 72, 72],
         n_heads=[8, 1, 1],
@@ -224,7 +224,7 @@ def __init__(
         m_attn=0.25,
         n_vocab=80,
         cond_m_conv=1,
-        max_bow_genre_size=1,  # this should only be in the tokenizer
+        max_bow_genre_size=1,  # TODO this should only be in the tokenizer
         name="AudioSamples",
         init_std=0.2,
         **kwargs,
@@ -300,8 +300,8 @@ def __init__(
         self.vq_vae_lmu = vq_vae_lmu
 
         self.vq_vae_commit = vq_vae_commit
-        self.spectral = spectral
-        self.multispectral = multispectral
+        # self.spectral = spectral
+        # self.multispectral = multispectral
 
         self.vq_vae_conv_block_depth = vq_vae_conv_block_depth
         self.vq_vae_conv_block_width = vq_vae_conv_block_width
@@ -316,7 +316,7 @@ def __init__(
         self.cond_zero_out = cond_zero_out
         self.n_ctx = n_ctx
         self.t_bins = t_bins
-        self.l_bins = l_bins
+        self.latent_dim = latent_dim
         self.downs_t = downs_t
         self.strides_t = strides_t
         self.single_enc_dec = single_enc_dec
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 55aac464a7e5e..281c48768df24 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -23,7 +23,7 @@
 import requests
 from transformers import JukeboxConfig, JukeboxModel
 from transformers.utils import logging
-
+import os
 
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
@@ -54,85 +54,156 @@ def rename_key(dct, old, new):
     ],
 }
 
-
+def replace_key(key) : 
+    if ".k." in key:   # replace vqvae.X.k with vqvae.X.codebook
+        return key.replace(".k.", ".codebook.")
+    elif ".y_emb." in key:
+            key = key.replace(".y_emb.", ".metadata_embedding.")
+      
+          
+# TODO right a clean conversion code using regex or replace 
+# depending on the most appropriate choice            
 def fix_jukebox_keys(state_dict, model_state_dict):
     new_dict = {}
+    model_unformatted_keys = {".".join(k.split('.')[2:]) for k in model_state_dict.keys()}
+    import re 
+    model_to_conv = {1:"conv1d_1", 3:"conv1d_2"}
+    re_cond_block = re.compile("conditioner_blocks.([\d]).cond.model.([\d]).([\d]).model.([\d])")
+    groups = re_cond_block.match(original_key).groups()
+    block_index = int(groups[0]) * 2 + int(groups[1])
+    re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}.resnet_block.{model_to_conv[groups[-1]]}"
+    
+    re_cond_block.sub(re_new_key,original_key)
+    
     for original_key, value in state_dict.items():
         key = original_key
-        wo_model = key.split("model")
-        if len(wo_model) == 2 and "encoders" in key:
-            if len(wo_model[1].split(".")) <= 3:
-                key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
-            else:
+        
+        if ".k." in key:
+            key = key.replace(".k.", ".codebook.")
+        
+        elif ".y_emb." in key:
+            key = key.replace(".y_emb.", ".metadata_embedding.")
+        else:
+            wo_model = key.split("model")
+            if len(wo_model) == 2 and "encoders" in key:
+                if len(wo_model[1].split(".")) <= 3:
+                    key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
+                else:
+                    block_index = str(
+                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2])
+                    )
+                    key = (
+                        wo_model[0]
+                        + "downsample_block."
+                        + block_index
+                        + "."
+                        + wo_model[1].split(".")[-1]
+                    )
+            elif len(wo_model) == 2 and "decoders" in key:
+                if len(wo_model[1].split(".")) <= 3:
+                    key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
+                else:
+                    block_index = str(
+                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                    )
+                    key = (
+                        wo_model[0]
+                        + "upsample_block."
+                        + block_index
+                        + "."
+                        + wo_model[1].split(".")[-1]
+                    )
+            elif len(wo_model) == 2 and "cond.model." in key:
+                if len(wo_model[1].split(".")) <= 3:
+                    key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
+                else:
+                    block_index = str(
+                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                    )
+                    key = (
+                        wo_model[0]
+                        + "upsample_block."
+                        + block_index
+                        + "."
+                        + wo_model[1].split(".")[-1]
+                    )
+            elif len(wo_model) == 3 and "priors" in key:
+                # should also rename cond to low_lvl_conditioner
+                block_index = str(
+                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                )
+                key = (
+                    wo_model[0]
+                    + "upsample_block."
+                    + block_index
+                    + ".resnet_block."
+                    + wo_model[1].split(".")[-2]
+                    + ".model"
+                    + wo_model[2]
+                )
+            elif len(wo_model) == 4 and "decoders" in key:
+                # convert from
+                # model.1.0 is the first upsample block's resnet layer. Then this
+                # layer has resnet_blocks (1 to 3) which has a sequential (last model). 3 is the 3nd conv
+                # vqvae.decoders.0.level_blocks.0.model.1.0.model.1.model.3.bias
+                # to
+                # vqvae.decoders.1.level_blocks.0.upsample_block.1.resnet_blocks.2.conv1d_2.weight
+                block_index = str(
+                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
+                )
+                key = (
+                    wo_model[0]
+                    + "upsample_block."
+                    + block_index
+                    + ".resnet_block."
+                    + wo_model[2].split(".")[1]
+                    + ".model"
+                    + wo_model[3]
+                )
+            elif len(wo_model) == 4 and "encoders" in key:
                 block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
-                key = wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-        elif len(wo_model) == 2 and "decoders" in key:
-            if len(wo_model[1].split(".")) <= 3:
-                key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
-            else:
-                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-        elif len(wo_model) == 2 and "cond.model." in key:
-            if len(wo_model[1].split(".")) <= 3:
-                key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
-            else:
-                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-        elif len(wo_model) == 3 and "priors" in key:
-            # should also rename cond to low_lvl_conditioner
-            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-            key = (
-                wo_model[0]
-                + "upsample_block."
-                + block_index
-                + ".resnet_block."
-                + wo_model[1].split(".")[-2]
-                + ".model"
-                + wo_model[2]
-            )
-        elif len(wo_model) == 4 and "decoders" in key:
-            # convert from
-            # model.1.0 is the first upsample block's resnet layer. Then this
-            # layer has resnet_blocks (1 to 3) which has a sequential (last model). 3 is the 3nd conv
-            # vqvae.decoders.0.level_blocks.0.model.1.0.model.1.model.3.bias
-            # to
-            # vqvae.decoders.1.level_blocks.0.upsample_block.1.resnet_blocks.2.conv1d_2.weight
-            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-            key = (
-                wo_model[0]
-                + "upsample_block."
-                + block_index
-                + ".resnet_block."
-                + wo_model[2].split(".")[1]
-                + ".model"
-                + wo_model[3]
-            )
-        elif len(wo_model) == 4 and "encoders" in key:
-            block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
-            key = (
-                wo_model[0]
-                + "downsample_block."
-                + block_index
-                + ".resnet_block."
-                + wo_model[2].split(".")[1]
-                + ".model"
-                + wo_model[3]
-            )
-
-        if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
-            key = key.replace(".model.1.bias", ".conv1d_1.bias")
-        elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
-            key = key.replace(".model.1.weight", ".conv1d_1.weight")
-        elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
-            key = key.replace(".model.3.bias", ".conv1d_2.bias")
-        elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
-            key = key.replace(".model.3.weight", ".conv1d_2.weight")
-
-        if key not in model_state_dict.keys():
+                key = (
+                    wo_model[0]
+                    + "downsample_block."
+                    + block_index
+                    + ".resnet_block."
+                    + wo_model[2].split(".")[1]
+                    + ".model"
+                    + wo_model[3]
+                )
+
+            if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
+                key = key.replace(".model.1.bias", ".conv1d_1.bias")
+            elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
+                key = key.replace(".model.1.weight", ".conv1d_1.weight")
+            elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
+                key = key.replace(".model.3.bias", ".conv1d_2.bias")
+            elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
+                key = key.replace(".model.3.weight", ".conv1d_2.weight")
+                
+        if ".cond." in key : 
+            key = key.replace(".cond.", ".upsampler.")
+        if ".ln" in key : 
+            key = key.replace(".ln", ".layer_norm")
+        if "_ln" in key : 
+            key = key.replace("_ln", "_layer_norm")
+        if "prime_prior" in key:
+            key = key.replace("prime_prior","lyric_encoder")
+        if "prime_x_out" in key:
+            key = key.replace("prime_x_out","lyric_enc_proj_out")
+        # if "x_emb" in key:
+        #     key = key.replace("x_emb","lyric_enc_proj_out")
+        if not "conditioner_blocks" in key and "x_emb" in key:
+            key = key.replace("x_emb","lyric_enc.proj_out")
+        if key not in model_unformatted_keys:
             print(f"failed converting {original_key} to {key}, does not match")
-        elif value.shape != model_state_dict[key].shape:
-            print(f"{original_key}-> {key} : \nshape {model_state_dict[key].shape} and { value.shape}, do not match")
-            key = original_key
+        
+        # elif value.shape != model_state_dict[key].shape:
+        #     print(
+        #         f"{original_key}-> {key} : \nshape {model_unformatted_keys[key].shape} and"
+        #         f" { value.shape}, do not match"
+        #     )
+        #     key = original_key
         new_dict[key] = value
     return new_dict
 
@@ -143,15 +214,17 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     Copy/paste/tweak model's weights to our Jukebox structure.
     """
     for file in MODEL_MAPPING[model_name]:
-        r = requests.get(f"{PREFIX}{file}", allow_redirects=True)
-        open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
+        if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"):
+            r = requests.get(f"{PREFIX}{file}", allow_redirects=True)
+            os.makedirs(f"{pytorch_dump_folder_path}/",exist_ok=True)
+            open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
 
     vqvae, *priors = MODEL_MAPPING[model_name.split("/")[-1]]
     vqvae_dic = torch.load(f"{pytorch_dump_folder_path}/{vqvae.split('/')[-1]}", map_location=torch.device("cpu"))[
         "model"
     ]
 
-    config = JukeboxConfig.from_pretrained(model_name)
+    config = JukeboxConfig.from_pretrained("ArthurZ/"+model_name)
     model = JukeboxModel(config)
 
     weight_dict = []
@@ -193,7 +266,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         help="Name of the model you'd like to convert.",
     )
     parser.add_argument(
-        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+        "--pytorch_dump_folder_path", default="converted_model", type=str, help="Path to the output PyTorch model directory."
     )
     args = parser.parse_args()
     convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 90748e19465ea..147652d8ffcfb 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -308,6 +308,7 @@ def level_block(level, down_t, stride_t):
         for level, down_t, stride_t in iterator:
             self.level_blocks.append(level_block(level, down_t, stride_t))
 
+        #TODO rename to proj out
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
     def forward(self, hidden_states, all_levels=True):
@@ -472,7 +473,7 @@ def decode(self, music_tokens):
         return dequantised_states
 
     def forward(self, hidden_states, update_codebook=True):
-        samples, width, seq_len = hidden_states.shape
+        samples, _, seq_len = hidden_states.shape
 
         # Preprocess
         hidden_states, prenorm = self.preprocess(hidden_states)
@@ -678,7 +679,6 @@ def sample(self, n_samples):
         ]
         return self.decode(music_tokens)
 
-    # TODO rename
     def forward(self, raw_audio):
         # Encode/Decode
         input_audio = self.preprocess(raw_audio)
@@ -961,16 +961,16 @@ def prev_block_attn(self, query, key, value, sample):
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
         batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
-            assert seq_len == self._suff_cache_len(), f"{l} != {self._suff_cache_len()}"
-            block = (l - 1) // block_ctx
+            assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
+            block = (seq_len - 1) // block_ctx
             prev_l = (block - 1) * block_ctx
             if block > 0:
                 assert prev_l == 0
                 key = key[:, prev_l : prev_l + block_ctx, :]
                 value = value[:, prev_l : prev_l + block_ctx, :]
             else:
-                key = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
-                value = torch.zeros(batch_size, block_ctx, d, device=query.device, dtype=query.dtype)
+                key = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
+                value = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             query_length = query.shape[1]
@@ -1045,9 +1045,9 @@ def decode_attn(self, query, key, value, sample):
         ), f"k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}"
         return self.dense_attn(query, key, value, sample)
 
-    def factored_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+    def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        assert encoder_key_value is None
+        assert lyric_encoder_states is None
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             self.sample_t += curr_ctx
@@ -1066,9 +1066,9 @@ def factored_qkv(self, hidden_states, encoder_key_value=None, sample=False):
                 value = self.cache["value"]
         return query, key, value, sample
 
-    def prime_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+    def prime_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        assert encoder_key_value is None
+        assert lyric_encoder_states is None
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             if self._cache_len() < self._prime_len:
@@ -1079,22 +1079,22 @@ def prime_qkv(self, hidden_states, encoder_key_value=None, sample=False):
             self.sample_t += curr_ctx
         return query, key, value, sample
 
-    def decode_qkv(self, hidden_states, encoder_key_value=None, sample=False):
+    def decode_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
         query = hidden_states
         if sample:
             if self.sample_t == 0:
-                self.cache["key"], self.cache["value"] = self.c_enc_kv(encoder_key_value.type_as(hidden_states)).chunk(2, dim=2)
+                self.cache["key"], self.cache["value"] = self.c_enc_kv(lyric_encoder_states.type_as(hidden_states)).chunk(2, dim=2)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         else:
-            key, value = self.c_enc_kv(encoder_key_value.type_as(hidden_states)).chunk(2, dim=2)
+            key, value = self.c_enc_kv(lyric_encoder_states.type_as(hidden_states)).chunk(2, dim=2)
         return query, key, value, sample
 
-    def forward(self, hidden_states, encoder_key_value=None, sample=False):
+    def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
         hidden_states = self.c_attn(hidden_states)
-        query, key, value, sample = self.qkv(hidden_states, encoder_key_value=encoder_key_value, sample=sample)
+        query, key, value, sample = self.qkv(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=sample)
         a = self.attn(query, key, value, sample)
         if a.shape[1] != curr_ctx:
             offset = self._offset(curr_ctx)
@@ -1115,8 +1115,8 @@ def _offset(self, curr_ctx):
 
     def _pad_to_block_ctx(self, hidden_states, query=False):
         seq_len = hidden_states.shape[1]
-        offset = self._offset(l) if query else 0
-        n_blocks = (l + offset + self.block_ctx - 1) // self.block_ctx
+        offset = self._offset(seq_len) if query else 0
+        n_blocks = (seq_len + offset + self.block_ctx - 1) // self.block_ctx
         pad = n_blocks * self.block_ctx - seq_len - offset
         if pad == 0 and offset == 0:
             return hidden_states
@@ -1256,10 +1256,10 @@ def __init__(
         self.width = width
         self.attn_func = attn_func
 
-    def forward(self, hidden_states, encoder_key_value, sample=False):
+    def forward(self, hidden_states, lyric_encoder_states, sample=False):
         residuals = hidden_states
         hidden_states = self.layer_norm_0(hidden_states)
-        hidden_states = self.attn(hidden_states, encoder_key_value, sample)
+        hidden_states = self.attn(hidden_states, lyric_encoder_states, sample)
 
         output_states = self.layer_norm_1(residuals + hidden_states)
         output_states = self.mlp(output_states)
@@ -1360,13 +1360,14 @@ def attn_block(d):
         self._attn_mods = nn.ModuleList()
         for d in range(n_depth):
             self._attn_mods.append(attn_block(d))
-        self.ws = []
+            
+        self.saved_attn_weights = []
 
     def set_record_attn(self, record_attn):
         """
         Arguments:
             record_attn (bool or set): Makes forward prop dump self-attention
-                softmaxes to self.ws. Either a set of layer indices indicating which layers to store, or a boolean
+                softmaxes to self.saved_attn_weights. Either a set of layer indices indicating which layers to store, or a boolean
                 value indicating whether to dump all.
         """
 
@@ -1378,26 +1379,26 @@ def _should_record_attn(layer_idx):
         for i, layer in enumerate(self._attn_mods):
             layer.attn.record_attn = _should_record_attn(i)
         if record_attn:
-            assert self.ws == []
+            assert self.saved_attn_weights == []
             for layer in self._attn_mods:
                 assert layer.attn.w is None
         else:
-            self.ws = []
+            self.saved_attn_weights = []
             for layer in self._attn_mods:
                 layer.attn.w = None
 
-    def forward(self, hidden_states, encoder_key_value=None, sample=False, fp16=False, fp16_out=False):
+    def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=False, fp16_out=False):
         if fp16:
             hidden_states = hidden_states.half()
 
         # Blocks
         for i, attn_layer in enumerate(self._attn_mods):
-            if attn_layer.attn_func == 6:
-                hidden_states = attn_layer(hidden_states, encoder_key_value=encoder_key_value, sample=sample)
+            if attn_layer.attn_func == 6: # attend to the lyrics
+                hidden_states = attn_layer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=sample)
             else:
-                hidden_states = attn_layer(hidden_states, encoder_key_value=None, sample=sample)
+                hidden_states = attn_layer(hidden_states, lyric_encoder_states=None, sample=sample)
             if attn_layer.attn.record_attn:
-                self.ws.append(attn_layer.attn.w)
+                self.saved_attn_weights.append(attn_layer.attn.w)
         if not fp16_out:
             hidden_states = hidden_states.float()
         return hidden_states
@@ -1415,9 +1416,9 @@ class JukeboxPositionalEmbedding(nn.Module):
     def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
         super().__init__()
         self.input_shape = input_shape
-        self.input_dims = input_dims = np.prod(input_shape)
+        self.input_dims = np.prod(input_shape)
         self.pos_init = pos_init
-        self.pos_emb = nn.Parameter(get_normal(input_dims, width, std=0.01 * init_scale))
+        self.pos_emb = nn.Parameter(get_normal(self.input_dims, width, std=0.01 * init_scale))
 
     def forward(self):
         if self.pos_init:
@@ -1432,7 +1433,7 @@ class JukeboxConditionalAutoregressive(nn.Module):
     def __init__(
         self,
         input_shape,
-        bins,
+        embed_dim,
         width=128,
         depth=2,
         heads=1,
@@ -1459,19 +1460,27 @@ def __init__(
         merged_decoder=False,
         prime_len=None,
     ):
+        """
+            - input_shape : respective dimension of the different inputs (lyrics/music_tokens) 
+            - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
+            dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder 
+            for the lyric tokens.
+            - encoder_dims : input dimension of the lyric encoder. 
+            - audio_conditioning : whether or not the prior supports conditionning on audio. 
+            - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and timing. When 
+            False, the start token is random.  
+        """
         super().__init__()
         self.input_shape = input_shape
         self.input_dims = input_dims = np.prod(input_shape)
-        self.encoder_dims = encoder_dims
-        # TODO rename self.bins
-        self.bins = bins
+        self.encoder_dims = encoder_dims 
+        self.embed_dim = embed_dim
         self.width = width
         self.depth = depth
 
-        # TODO  rename x_emb to proper name, as well as x_out 
-        self.x_emb = nn.Embedding(bins, width)
-        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
-        self.x_emb_dropout = nn.Dropout(emb_dropout)
+        self.embed_tokens = nn.Embedding(embed_dim, width)
+        nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
+        self.embed_tokens_dropout = nn.Dropout(emb_dropout)
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
@@ -1506,46 +1515,44 @@ def __init__(
             encoder_dims=encoder_dims,
             prime_len=prime_len,
         )
-
+        # TODO rename prime_len
         self.only_encode = only_encode
         self.prime_len = prime_len
         if merged_decoder:
             # Merged piped model uses this setup
             self.add_cond_after_transformer = False
-            self.share_x_emb_x_out = False
+            self.share_embed_tokens_fc_proj_out = False
         else:
             self.add_cond_after_transformer = True
-            self.share_x_emb_x_out = True
+            self.share_embed_tokens_fc_proj_out = True
 
-        # TODO rename x_out to proj_out + x_embed
         if not only_encode:
-            self.x_out = nn.Linear(width, bins, bias=False)
-            if self.share_x_emb_x_out:
-                self.x_out.weight = self.x_emb.weight
+            self.fc_proj_out = nn.Linear(width, embed_dim, bias=False)
+            if self.share_embed_tokens_fc_proj_out:
+                self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
 
-    def preprocess(self, hidden_states):
+    def preprocess(self, tokens):
         # Input: hidden_states is NHWC and uint8. Converted to NL and long
         # Can include stuff like bitpacking, reordering here.
-        N = hidden_states.shape[0]
-        return hidden_states.view(N, -1).long()
+        N = tokens.shape[0]
+        return tokens.view(N, -1).long()
 
-    def postprocess(self, hidden_states, sample_tokens=None):
+    def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
-        N = hidden_states.shape[0]
-        assert (0 <= hidden_states).all() and (hidden_states < self.bins).all()
+        N = tokens.shape[0]
+        assert (0 <= tokens).all() and (tokens < self.embed_dim).all()
         if sample_tokens is None or sample_tokens == self.input_dims:
-            return hidden_states.view(N, *self.input_shape)
+            return tokens.view(N, *self.input_shape)
         else:
-            return hidden_states.view(N, -1)
+            return tokens.view(N, -1)
 
-    # TODO RENAME x_prime, x_gen, target, x_emb
     def forward(
         self,
-        hidden_states,
+        tokens,
         audio_conditioning=None,
         metadata_conditioning=None,
-        encoder_key_value=None,
+        lyric_encoder_states=None,
         fp16=False,
         loss_full=False,
         encode=False,
@@ -1553,44 +1560,48 @@ def forward(
         get_acts=False,
         get_sep_loss=False,
     ):
+        """
+            - tokens : composed of both music tokens and lyrics tokens or just music tokens 
+        """
         # Preprocess.
         with torch.no_grad():
-            hidden_states = self.preprocess(hidden_states)
+            tokens = self.preprocess(tokens)
 
         N = hidden_states.shape[0]
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((N, 1, self.width), device=hidden_states.device, dtype=torch.float)
+            audio_conditioning = torch.zeros((N, 1, self.width), device=tokens.device, dtype=torch.float)
 
-        target = hidden_states  # Target
-        hidden_states = self.x_emb(hidden_states)  # hidden_states emb
+        target = tokens  # Target
+        hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
         hidden_states = roll(hidden_states, 1)  # Shift by 1, and fill in start token
         if self.metadata_conditioning:
             hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
         else:
             hidden_states[:, 0] = self.start_token
 
-        hidden_states = self.x_emb_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning  # Pos emb and dropout
+        hidden_states = self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning  # Pos emb and dropout
 
-        hidden_states = self.transformer(hidden_states, encoder_key_value=encoder_key_value, fp16=fp16)  # Transformer
+        hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16)  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
             hidden_states = hidden_states + audio_conditioning
 
         acts = hidden_states
         if self.only_encode:
             return hidden_states
-        hidden_states = self.x_out(hidden_states)  # Predictions
+        hidden_states = self.fc_proj_out(hidden_states)  # Predictions
 
         if get_sep_loss:
             # TODO rename x_prime and x_gen. Prime is related to primed sampling
-            x_prime = hidden_states[:, : self.prime_len].reshape(-1, self.bins)
-            x_gen = hidden_states[:, self.prime_len :].reshape(-1, self.bins)
+            # TODO rename prime_length, prime_loss  (related au primed_sample)
+            x_prime = hidden_states[:, : self.prime_len].reshape(-1, self.embed_dim)
+            x_gen = hidden_states[:, self.prime_len :].reshape(-1, self.embed_dim)
 
             prime_loss = F.cross_entropy(x_prime, target[:, : self.prime_len].reshape(-1)) / np.log(2.0)
             gen_loss = F.cross_entropy(x_gen, target[:, self.prime_len :].reshape(-1)) / np.log(2.0)
 
             loss = (prime_loss, gen_loss)  # Note order! Prime is first
         else:
-            loss = F.cross_entropy(hidden_states.view(-1, self.bins), target.view(-1)) / np.log(2.0)  # Loss
+            loss = F.cross_entropy(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0)  # Loss
 
         if get_preds:
             return loss, hidden_states
@@ -1599,7 +1610,7 @@ def forward(
         else:
             return loss, None
 
-    def get_emb(self, sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning):
+    def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
         N, D = n_samples, self.input_dims
         if sample_t == 0:
             hidden_states = torch.empty(n_samples, 1, self.width).to(audio_conditioning.device)
@@ -1608,7 +1619,7 @@ def get_emb(self, sample_t, n_samples, hidden_states, audio_conditioning, metada
             else:
                 hidden_states[:, 0] = self.start_token
         else:
-            hidden_states = self.x_emb(hidden_states)
+            hidden_states = self.embed_tokens(tokens)
         if audio_conditioning.shape == (N, D, self.width):
             cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
@@ -1621,7 +1632,7 @@ def sample(
         n_samples,
         audio_conditioning=None,
         metadata_conditioning=None,
-        encoder_key_value=None,
+        lyric_encoder_states=None,
         fp16=False,
         temp=1.0,
         top_k=0,
@@ -1647,11 +1658,11 @@ def sample(
                 hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
-                    hidden_states, encoder_key_value=encoder_key_value, sample=True, fp16=fp16
-                )  # TODO put fp16 back # Transformer
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16
+                )
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
-                hidden_states = self.x_out(hidden_states)  # Predictions
+                hidden_states = self.fc_proj_out(hidden_states)  # Predictions
                 if get_preds:
                     preds.append(hidden_states.clone())
                 # Adjust logits
@@ -1677,7 +1688,7 @@ def primed_sample(
         hidden_states,
         audio_conditioning=None,
         metadata_conditioning=None,
-        encoder_key_value=None,
+        lyric_encoder_states=None,
         fp16=False,
         temp=1.0,
         top_k=0,
@@ -1729,7 +1740,7 @@ def primed_sample(
                 del conds_prime
                 if not get_preds:
                     del cond_prime
-                x_prime = self.transformer(x_prime, encoder_key_value=encoder_key_value, sample=True, fp16=fp16)
+                x_prime = self.transformer(x_prime, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16)
 
                 if get_preds:
                     if self.add_cond_after_transformer:
@@ -1741,7 +1752,7 @@ def primed_sample(
 
             if get_preds:
                 x_prime = torch.cat(x_primes, dim=1)
-                x_prime = self.x_out(x_prime)  # Predictions
+                x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
             empty_cache()
@@ -1753,10 +1764,10 @@ def primed_sample(
             for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
                 hidden_states, cond = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
                 self.transformer.check_cache(n_samples, sample_t, fp16)
-                hidden_states = self.transformer(hidden_states, encoder_key_value=encoder_key_value, sample=True, fp16=fp16)  # Transformer
+                hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16)  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
-                hidden_states = self.x_out(hidden_states)  # Predictions
+                hidden_states = self.fc_proj_out(hidden_states)  # Predictions
                 if get_preds:
                     preds.append(hidden_states)
                 # Adjust logits
@@ -1836,26 +1847,24 @@ class MusicTokenConditioner(nn.Module):
     The MusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE codebook) and
     upsamples it using a single layer of decoder convolution block (the same is used in the VQ-VAE).
 
-    The tokens are passed through an embedding layer and the embeddings are upsampled.
+    The embedding layer is different from the vaqvae's bottleneck
 
     """
-
+    # TODO check why embed_dim is initialized to config.latent_dim which is 2048 = to codebook_di. is it 
+    # latent_dim? 
     def __init__(
-        self, input_shape, bins, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
+        self, input_shape, embed_dim, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
     ):
         super().__init__()
-        self.x_shape = input_shape
-
-        # TODO rename x_emb
+        # self.x_shape = input_shape # is this needed? 
         self.width = out_width
-        self.x_emb = nn.Embedding(bins, out_width)
-        nn.init.normal_(self.x_emb.weight, std=0.02 * init_scale)
+        self.embed_tokens = nn.Embedding(embed_dim, out_width)
+        nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
 
         # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
         self.upsampler = DecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
-        # TODO rename all layer_norm to layer_norm
         self.layer_norm = JukeboxLayerNorm(self.width)
 
     def preprocess(self, hidden_states):
@@ -1866,19 +1875,18 @@ def postprocess(self, hidden_states):
         hidden_states = hidden_states.permute(0, 2, 1)  # NCT -> NTC
         return hidden_states
 
-    # TODO rename to raw audio and hidden states
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         Args :
-            - music_tokens : indexes of codebook vectors
+            - music_tokens : int or long, in range(codebook_dim)
             - raw_audio_conditionning : used when prime sampling, raw audio information that conditions
             the generation
         """
         if raw_audio_conditionning is None:
             raw_audio_conditionning = 0.0
-        # Embed hidden_states
+        # Embed music_tokens
         music_tokens = music_tokens.long()
-        hidden_states = self.x_emb(music_tokens)
+        hidden_states = self.embed_tokens(music_tokens)
         hidden_states = hidden_states + raw_audio_conditionning
 
         # Run conditioner
@@ -1899,10 +1907,10 @@ def _flip(hidden_states):
 
 
 class SimpleEmbedding(nn.Module):
-    def __init__(self, bins, out_width, init_scale):
+    def __init__(self, embed_dim, out_width, init_scale):
         super().__init__()
-        self.bins = bins
-        self.emb = nn.Embedding(bins, out_width)
+        self.embed_dim = embed_dim
+        self.emb = nn.Embedding(embed_dim, out_width)
 
     def forward(self, y):
         return self.emb(y)
@@ -1917,12 +1925,11 @@ class RangeEmbedding(nn.Module):
     # [start,end) mapped to [0,1,...,bins-1]
     # [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
     # NOTE: Open ended interval on right, so start <= pos < end, not <= end
-    def __init__(self, n_time, bins, range, out_width, init_scale, clamp=False):
+    def __init__(self, n_time, embed_dim, range, out_width, init_scale, clamp=False):
         super().__init__()
         self.n_time = n_time
-        # TODO rename bins
-        self.bins = bins
-        self.emb = nn.Embedding(bins, out_width)
+        self.embed_dim = embed_dim
+        self.emb = nn.Embedding(embed_dim, out_width)
         nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
         self.pos_min, self.pos_max = range
         self.clamp = clamp
@@ -1953,13 +1960,12 @@ def forward(self, pos_start, pos_end=None):
         else:
             position = pos_start
 
-        # Bin each value to bins
+        # Bin each value to bins_
         normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)  # [0,1)
-        bins = (self.bins * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,bins) -> [0,1...,bins-1]
-        return self.emb(bins)
+        bins_ = (self.embed_dim * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1]
+        return self.emb(bins_)
 
 
-# TODO rename y_bins and timing_dims as well as y
 class LabelConditioner(nn.Module):
     def __init__(
         self,
@@ -2021,10 +2027,10 @@ def forward(self, metadata):
         return start_emb, pos_emb
 
 
-# TODO rename l_bins, bins, prior_bins, prime_x_out
+
 class JukeboxPrior(nn.Module):
     """
-    Model the prior on vquery codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
+    Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
     on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
     we use the MusicTokenConditioner class To condition on lyrics, we allow two types of priors:
     - Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer
@@ -2032,12 +2038,21 @@ class JukeboxPrior(nn.Module):
     models the lyrics, and we use its last layer to produce keys/values that are attened to by the decoder transformer
     - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
       vocab
-    and Vquery vocab into a single large vocab, and the lyric tokens and Vquery tokens into a single longer sequence of tokens
-    which we autoregressively model together.
+    and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
+    which we autoregressively model together. # TODO this explains the input bins that are different from the lower lvl transformers. 
+    
+    Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in the primed sample
+    or sample functions. If the model is not trained using these/ uses the forward differently then I guess it is fine but otherwise it looks strange.
     """
 
-    def __init__(self, config, level):
+    def __init__(self, config, level, encoder = None, decoder = None):
         super().__init__()
+        
+        # Passing functions instead of the vqvae module to avoid getting params, only used in the 
+        # forward loop 
+        self.encoder = encoder
+        self.decoder = decoder
+        
         vqvae_music_tokens_shapes = config.vqvae_music_tokens_shapes
 
         def rescale(music_tokens_shape):
@@ -2046,11 +2061,13 @@ def rescale(music_tokens_shape):
         music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
         self.use_tokens = config.use_tokens[-level - 1]
         self.n_tokens = config.n_tokens[-level - 1]
+        
+        # TODO rename prime loss fraction 
         self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
 
         self.copy_input = config.copy_input
         if self.copy_input:
-            config.bins = config.l_bins
+            config.bins = config.latent_dim
 
         self.music_tokens_shapes = music_tokens_shapes
         self.levels = len(self.music_tokens_shapes)
@@ -2058,13 +2075,12 @@ def rescale(music_tokens_shape):
         self.music_tokens_shape = self.music_tokens_shapes[level]
 
         self.level = level
-
-        # TODO rename l_bins which is the lyrics tokens
-        self.l_bins = config.l_bins
+        
+        self.latent_dim = config.latent_dim
 
         prior_kwargs = dict(
             input_shape=(config.n_ctx[-level - 1],),
-            bins=config.l_bins,
+            embed_dim=config.latent_dim,
             width=config.width[-level - 1],
             depth=config.depth[-level - 1],
             heads=config.n_heads[-level - 1], # TODO Rename in config
@@ -2082,8 +2098,10 @@ def rescale(music_tokens_shape):
         )
 
         if config.use_tokens and not config.single_enc_dec[-level - 1]:
-            prime_kwargs = dict(
-                bins=config.n_vocab,
+            # TODO rename to encoder_kwargs as they are used both 
+            # when single and not
+            lyric_enc_kwargs = dict(
+                n_vocab=config.n_vocab,
                 width=config.prime_width[-level - 1],
                 depth=config.prime_depth[-level - 1],
                 heads=config.prime_heads,
@@ -2101,7 +2119,7 @@ def rescale(music_tokens_shape):
                 m_mlp=config.prime_m_mlp,
             )
         else:
-            prime_kwargs = dict(bins=config.n_vocab)
+            lyric_enc_kwargs = dict(n_vocab=config.n_vocab)
 
         audio_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
@@ -2119,8 +2137,8 @@ def rescale(music_tokens_shape):
         metadata_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
             init_scale=config.init_scale[-level - 1],
-            metadata_dims=config.y_bins[-level - 1], # rename to metadata_bins
-            timing_dims=config.t_bins, # rename to timing_dims
+            metadata_dims=config.y_bins[-level - 1], # rename to metadata_dims
+            timing_dims=config.t_bins, # rename to timing_dims or timing_intervals
             sr=config.sr,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
@@ -2132,7 +2150,7 @@ def rescale(music_tokens_shape):
         self.cond_level = level + 1
 
         # metadata conditioning
-        self.metadata_conditioning = config.labels
+        self.metadata_conditioning = config.labels # TODO change config
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
         # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
@@ -2142,7 +2160,7 @@ def rescale(music_tokens_shape):
             def conditioner_block(_level):
                 return MusicTokenConditioner(
                     input_shape=music_tokens_shapes[_level],
-                    bins=config.l_bins,
+                    embed_dim=config.latent_dim,        # TODO should we remove in favor of the vqvae dim? Maybe not
                     down_t=config.downs_t[_level],
                     stride_t=config.strides_t[_level],
                     **audio_conditioning_kwargs,
@@ -2156,56 +2174,60 @@ def conditioner_block(_level):
             self.n_time = self.music_tokens_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
             self.metadata_embedding = LabelConditioner(n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs)
 
-        # Lyric conditioning
+        # TODO as the prior type can change, can't rename to decoder or enc_dec
         if config.single_enc_dec[-level - 1]:
             # Single encoder-decoder transformer
             self.prior_shapes = [(self.n_tokens,), prior_kwargs.pop("input_shape")]
-            # TODO rename bins
-            self.prior_bins = [prime_kwargs["bins"], prior_kwargs.pop("bins")]
+            self.prior_embed_dim = [lyric_enc_kwargs["n_vocab"], prior_kwargs.pop("embed_dim")]
             self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
-            self.prior_bins_shift = np.cumsum([0, *self.prior_bins])[:-1]
+            self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
             self.prior_width = prior_kwargs["width"]
-            print(f"Creating cond. autoregress with prior bins {self.prior_bins}, ")
-            print(f"dims {self.prior_dims}, ")
-            print(f"shift {self.prior_bins_shift}")
-            print(f"input shape {sum(self.prior_dims)}")
-            print(f"input bins {sum(self.prior_bins)}")
-            print(f"Self copy is {self.copy_input}")
-
-            self.prime_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
-            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            
+            # print(f"Creating cond. autoregress with prior embed_dim {self.prior_embed_dim}, ")
+            # print(f"dims {self.prior_dims}, ")
+            # print(f"shift {self.prior_embed_dim_shift}")
+            # print(f"input shape {sum(self.prior_dims)}")
+            # print(f"input embed_dim (vocab size of the embedding layer) {sum(self.prior_embed_dim)}")
+            # print(f"Self copy is {self.copy_input}")
+
+            # lyrics_enc_loss_dims was the prime loss dims, gen is for the generated tokens. 
+            # what is the shape of the lyrics loss? 
+            
+            self.lyrics_enc_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
+            self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
             self.prior = JukeboxConditionalAutoregressive(
                 input_shape=(sum(self.prior_dims),),
-                bins=sum(self.prior_bins),
+                embed_dim=sum(self.prior_embed_dim),
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
-                prime_len=self.prime_loss_dims,
+                prime_len=self.lyrics_enc_loss_dims,
                 **prior_kwargs,
             )
 
         else:
             # Separate encoder-decoder transformer
             if self.n_tokens != 0 and self.use_tokens:
-                # TODO rename prime
-                prime_input_shape = (self.n_tokens,)
-                self.prime_loss_dims = np.prod(prime_input_shape)
-                self.prime_acts_width, self.prime_state_width = prime_kwargs["width"], prior_kwargs["width"]
-                self.prime_prior = JukeboxConditionalAutoregressive(
-                    input_shape=prime_input_shape, audio_conditioning=False, metadata_conditioning=False, only_encode=True, **prime_kwargs
+                lyric_enc_input_shape = (self.n_tokens,)
+                self.lyrics_enc_loss_dims = np.prod(lyric_enc_input_shape)
+                self.lyric_acts_width, self.lyric_enc_width = lyric_enc_kwargs["width"], prior_kwargs["width"]
+                self.lyric_encoder = JukeboxConditionalAutoregressive(
+                    input_shape=lyric_enc_input_shape, audio_conditioning=False, metadata_conditioning=False, only_encode=True, **lyric_enc_kwargs
                 )
-                self.prime_state_proj = JukeboxConv1D(self.prime_acts_width, self.prime_state_width)
-                self.prime_state_layer_norm = JukeboxLayerNorm(self.prime_state_width)
-                self.prime_bins = prime_kwargs["bins"]
-                self.prime_x_out = nn.Linear(self.prime_state_width, self.prime_bins, bias=False)
-                nn.init.normal_(self.prime_x_out.weight, std=0.02 * prior_kwargs["init_scale"])
+                self.lyric_encoder_proj_out = JukeboxConv1D(self.lyric_acts_width, self.lyric_enc_width)
+                self.lyric_encoder_layer_norm = JukeboxLayerNorm(self.lyric_enc_width)
+                self.lyric_enc_dim = lyric_enc_kwargs["n_vocab"]
+                self.lyric_encoder.proj_out = nn.Linear(self.lyric_enc_width, self.lyric_enc_dim, bias=False)
+                nn.init.normal_(self.lyric_encoder.proj_out.weight, std=0.02 * prior_kwargs["init_scale"])
             else:
-                self.prime_loss_dims = 0
+                self.lyrics_enc_loss_dims = 0
             self.gen_loss_dims = np.prod(self.music_tokens_shape)
-            self.total_loss_dims = self.prime_loss_dims + self.gen_loss_dims
+            self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
+            
+            # prior on the tokens
             self.prior = JukeboxConditionalAutoregressive(
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=self.metadata_conditioning,
-                encoder_dims=self.prime_loss_dims,
+                encoder_dims=self.lyrics_enc_loss_dims,
                 merged_decoder=config.merged_decoder[-level - 1],
                 **prior_kwargs,
             )
@@ -2239,9 +2261,10 @@ def get_metadata(self, labels, start, total_length, offset, get_indices=False):
             return metadata
 
     def set_metadata_lyric_tokens(self, labels):
-        # assert metadatas.shape[0] == len(labels)
+        """
+            Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens. 
+        """
         if self.n_tokens > 0:
-            # total_length, offset, duration):
             tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
             indices_list = []  # whats the index of each current character in original array
             for idx in range(labels.shape[0]):
@@ -2256,6 +2279,9 @@ def set_metadata_lyric_tokens(self, labels):
             return labels, None
 
     def get_music_tokens_conds(self, music_tokens, start, end):
+        """
+            Extracts current level's conditioning music tokens. 
+        """
         if self.level != self.levels - 1:
             assert start % self.cond_downsample == end % self.cond_downsample == 0
             music_tokens_cond = music_tokens[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
@@ -2265,71 +2291,95 @@ def get_music_tokens_conds(self, music_tokens, start, end):
             music_tokens_conds = None
         return music_tokens_conds
 
-    def prior_preprocess(self, sampled_audio, conds):
-        N = sampled_audio[0].shape[0]
-        for i in range(len(sampled_audio)):
-            sampled_audio[i] = (sampled_audio[i] + int(self.prior_bins_shift[i])).view(N, -1)
+    def prior_preprocess(self, tokens, conds):
+        """
+            Shifts the input tokens to account for the dictionnary merge. 
+            The prior_embed_dim_shift give by how much. the music tokens should be
+            shifted by + nb_vocab.
+        """
+        N = tokens[0].shape[0]
+        for i in range(len(tokens)):
+            tokens[i] = (tokens[i] + int(self.prior_embed_dim_shift[i])).view(N, -1)
 
         for i in range(len(conds)):
             cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
             if cond is None:
-                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=sampled_audio[0].device)
+                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=tokens[0].device)
 
-        return torch.cat(sampled_audio, dim=1), torch.cat(conds, dim=1)
+        return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
 
-    def prior_postprocess(self, music_tokens):
-        N = music_tokens.shape[0]
-        dims = (self.prior_dims[0], music_tokens.shape[1] - self.prior_dims[0])
-        sampled_audio = list(torch.split(music_tokens, dims, dim=1))
+    def prior_postprocess(self, tokens):
+        """
+        Shifts back the input tokens if the model is uses an encoder decoder architecture. 
+        As the embedding layer is shared, prior_embed_dim_shift shifts the music token ids by
+         - nb_vocab. 
+        Returns : only returns the music tokens
+        """
+        N = tokens.shape[0]
+        # dim (nb_lyric_tokens, vqvae_codebook dim = latent_dim of the model)
+        dims = (self.prior_dims[0], tokens.shape[1] - self.prior_dims[0])
+        tokens = list(torch.split(tokens, dims, dim=1))
 
-        for i in range(len(sampled_audio)):
+        # Some of the input tokens might be shifted to take into account the voccabulary fusion
+        for i in range(len(tokens)):
             shape = self.prior_shapes[i]
-            _, bins_shift = int(self.prior_bins[i]), int(self.prior_bins_shift[i])  # bins, -> _,
-            sampled_audio[i] = (sampled_audio[i] - bins_shift).view(N, -1, *shape[1:])
-            sampled_audio[i] = torch.clamp(
-                sampled_audio[i], min=0
+            _, bins_shift = int(self.prior_embed_dim[i]), int(self.prior_embed_dim_shift[i])  # bins, -> _,
+            tokens[i] = (tokens[i] - bins_shift).view(N, -1, *shape[1:])
+            tokens[i] = torch.clamp(
+                tokens[i], min=0
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
 
-        return sampled_audio[-1]
+        return tokens[-1]
     
-    # TODO Rename x_emb
-    def x_emb(self, music_tokens_conds):
+    def embed_tokens(self, music_tokens_conds):
+        """
+            Embeds the upper level music tokens and upsamples them to provide as audio conditioning. 
+        """
         music_tokens_conds = music_tokens_conds[: self.cond_level - self.level]
         audio_conditioning = None
         for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, self.conditioner_blocks))):
             audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
         return audio_conditioning
 
-    # should be removed as the vq-vae is no longer part of the prior
-    # def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
-    #     if start_level is None:
-    #         start_level = self.level
-    #     if end_level is None:
-    #         end_level = self.levels
-    #     # Get latents
-    #     with torch.no_grad():
-    #         music_tokens = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
-    #     return music_tokens
-
-    # same as above, the va-vae is no longer part of the prior
-    # def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
-    #     if start_level is None:
-    #         start_level = self.level
-    #     if end_level is None:
-    #         end_level = self.levels
-    #     with torch.no_grad():
-    #         x_out = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
-    #     return x_out
+    # Used in the forward pass
+    def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
+        """
+            Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states. 
+        """
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+        # Get latents
+        with torch.no_grad():
+            latent_states = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return latent_states
+
+    
+    def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
+        """
+            Usamples the sequence of codebook vectors  to a raw audio.  
+        """
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+        with torch.no_grad():
+            output = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+        return output
 
     def get_cond(self, music_tokens_conds, metadata):
+        """
+            Converts the tokens to the input_embeddings. Splits the lyrics and the metadata. Lyric tokens can be None
+        """
         if metadata is not None:
             n_labels = metadata.shape[1] - self.n_tokens
-            metadata, prime = metadata[:, :n_labels], metadata[:, n_labels:]
+            metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:]
         else:
-            metadata, prime = None, None
+            metadata, lyric_tokens = None, None
         metadata_conditioning, metadata_pos = self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
-        audio_conditioning = self.x_emb(music_tokens_conds) if self.audio_conditioning else metadata_pos
-        return audio_conditioning, metadata_conditioning, prime
+        audio_conditioning = self.embed_tokens(music_tokens_conds) if self.audio_conditioning else metadata_pos
+        return audio_conditioning, metadata_conditioning, lyric_tokens
 
     def sample(
         self,
@@ -2344,23 +2394,24 @@ def sample(
         chunk_size=None,
         sample_tokens=None,
     ):
+        """
+            Ancestral sampling a window of tokens using the provided conditioning and metadatas
+        """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
         print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
 
         with torch.no_grad():
             # Currently audio_conditioning only uses immediately above layer
-            # TODO Rename prime
-            audio_conditioning, metadata_conditioning, prime = self.get_cond(music_tokens_conds, metadata)
+            audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
             if self.single_enc_dec:
                 if no_past_context:
-                    # TODO Rename prime
-                    music_tokens, audio_conditioning = self.prior_preprocess([prime], [None, audio_conditioning])
+                    music_tokens, audio_conditioning = self.prior_preprocess([lyric_tokens], [None, audio_conditioning])
                 else:
-                    music_tokens, audio_conditioning = self.prior_preprocess([prime, music_tokens], [None, audio_conditioning])
+                    music_tokens, audio_conditioning = self.prior_preprocess([lyric_tokens, music_tokens], [None, audio_conditioning])
                 if sample_tokens is not None:
                     sample_tokens += self.n_tokens
-                music_tokens = self.prior.primed_sample(
+                tokens = self.prior.primed_sample(
                     n_samples,
                     music_tokens,
                     audio_conditioning,
@@ -2372,15 +2423,15 @@ def sample(
                     chunk_size=chunk_size,
                     sample_tokens=sample_tokens,
                 )
-                music_tokens = self.prior_postprocess(music_tokens)
+                music_tokens = self.prior_postprocess(tokens)
             else:
-                encoder_key_value = self.get_encoder_key_value(prime, fp16=fp16, sample=True)
+                lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16, sample=True)
                 if no_past_context:
                     music_tokens = self.prior.sample(
                         n_samples,
                         audio_conditioning,
                         metadata_conditioning,
-                        encoder_key_value,
+                        lyric_encoder_states,
                         fp16=fp16,
                         temp=temp,
                         top_k=top_k,
@@ -2393,7 +2444,7 @@ def sample(
                         music_tokens,
                         audio_conditioning,
                         metadata_conditioning,
-                        encoder_key_value,
+                        lyric_encoder_states,
                         fp16=fp16,
                         temp=temp,
                         top_k=top_k,
@@ -2403,54 +2454,65 @@ def sample(
                     )
         return music_tokens
 
-    def get_encoder_key_value(self, prime, fp16=False, sample=False):
-        # TODO Rename prime
+    def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
+        """
+            Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. 
+            Forwards through the lyric encoder. 
+        """
         if self.n_tokens != 0 and self.use_tokens:
             if sample:
-                self.prime_prior = self.prime_prior.to(prime.device)
-            prime_acts = self.prime_prior(prime, None, None, None, fp16=fp16)
-            encoder_key_value = self.prime_state_layer_norm(self.prime_state_proj(prime_acts))
+                self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
+            lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
+            lyric_acts = self.lyric_encoder_proj_out(lyric_acts)
+            lyric_encoder_states = self.lyric_encoder_layer_norm(lyric_acts)
             if sample:
-                self.prime_prior.cpu()
+                self.lyric_encoder.cpu()
                 if fp16:
-                    encoder_key_value = encoder_key_value.half()
+                    lyric_encoder_states = lyric_encoder_states.half()
         else:
-            encoder_key_value = None
-        return encoder_key_value
+            lyric_encoder_states = None
+        return lyric_encoder_states
 
-    def get_prime_loss(self, encoder_key_value, prime_t):
+    def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
+        """
+            Computes the loss for the lyric encoder, next token prediction.
+        """
         if self.use_tokens:
-            encoder_key_value = encoder_key_value.float()
-            encoder_key_value = self.prime_x_out(encoder_key_value)
-            prime_loss = nn.functional.cross_entropy(encoder_key_value.view(-1, self.prime_bins), prime_t.view(-1)) / np.log(
+            lyric_encoder_states = lyric_encoder_states.float()
+            lyric_encoder_states = self.lyric_encoder.proj_out(lyric_encoder_states)
+            lyric_enc_loss = nn.functional.cross_entropy(lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)) / np.log(
                 2.0
             )
         else:
-            prime_loss = torch.tensor(0.0, device="cuda")
-        return prime_loss
+            lyric_enc_loss = torch.tensor(0.0, device="cuda")
+        return lyric_enc_loss
 
-    def music_tokens_forward(self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False):
+    def forward_tokens(self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False):
         """
-        Arguments:
+        Applies a forward pass using the conditioning tokens. Different from the classif forward as 
+        it does not use the vqvae's encoding layers. 
+            
+        Args:
             get_attn_weights (bool or set): Makes forward prop dump
-                self-attention softmaxes to self.prior.transformer.ws. Either a set of layer indices indicating which
+                self-attention softmaxes to self.prior.transformer.saved_attn_weights. Either a set of layer indices indicating which
                 layers to store, or a boolean value indicating whether to dump all.
         """
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
-        audio_conditioning, metadata_conditioning, prime = self.get_cond(music_tokens_conds, metadata)
+        audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
         if self.copy_input:
-            prime = music_tokens[:, : self.n_tokens]
-        if self.single_enc_dec:
-            music_tokens, audio_conditioning = self.prior_preprocess([prime, music_tokens], [None, audio_conditioning])
+            lyric_tokens = music_tokens[:, : self.n_tokens]
+            
+        if self.single_enc_dec: # the preprocess returns the full tokens, shifted
+            tokens, audio_conditioning = self.prior_preprocess([lyric_tokens, music_tokens], [None, audio_conditioning])
             (prime_loss, gen_loss), preds = self.prior(
-                music_tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
+                tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
             )
         else:
-            encoder_key_value = self.get_encoder_key_value(prime, fp16=fp16)
-            prime_loss = self.get_prime_loss(encoder_key_value, prime)
-            gen_loss, preds = self.prior(music_tokens, audio_conditioning, metadata_conditioning, encoder_key_value, fp16=fp16, get_preds=get_preds)
-        loss = (self.prime_loss_fraction * prime_loss * self.prime_loss_dims / self.total_loss_dims) + (
+            lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16)
+            prime_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
+            gen_loss, preds = self.prior(music_tokens, audio_conditioning, metadata_conditioning, lyric_encoder_states, fp16=fp16, get_preds=get_preds)
+        loss = (self.prime_loss_fraction * prime_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
             gen_loss * self.gen_loss_dims / self.total_loss_dims
         )
         metrics = dict(
@@ -2459,17 +2521,16 @@ def music_tokens_forward(self, music_tokens, music_tokens_conds=[], metadata=Non
         if get_preds:
             metrics["preds"] = preds.clone().detach()
         if get_attn_weights:
-             # TODO Rename ws to something more meaningful
-            ws = self.prior.transformer.ws
+            saved_attn_weights = self.prior.transformer.saved_attn_weights
             self.prior.transformer.set_record_attn(False)
-            return ws
+            return saved_attn_weights
         else:
             return loss, metrics
 
     def forward(self, hidden_states, metadata=None, fp16=False, decode=False, get_preds=False):
         batch_size = hidden_states.shape[0]
         music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
-        loss, metrics = self.music_tokens_forward(music_tokens=music_tokens, music_tokens_conds=music_tokens_conds, metadata= metadata, fp16=fp16, get_preds=get_preds)
+        loss, metrics = self.forward_tokens(music_tokens=music_tokens, music_tokens_conds=music_tokens_conds, metadata= metadata, fp16=fp16, get_preds=get_preds)
         if decode:
             dequantised_states = self.decode([music_tokens, *music_tokens_conds])
         else:
@@ -2541,8 +2602,11 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-# TODO fix this, consumes too much RAM so should probably be removed
+# FIXME, consumes too much RAM so should probably be removed
 def get_alignment(music_tokens, labels, prior, level, fp16, hps):
+    """
+    Should compute the lyric to music token alignment, but for now it cannot be used. 
+    """
     level = level - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]
@@ -2573,7 +2637,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
-            w_hop = prior.music_tokens_forward(tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers)
+            w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers)
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -2704,11 +2768,11 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             # Nothing new to sample
             return music_tokens
 
-        # get music_tokens_conds from level above TODO rename to latent_cond? Or music_token_cond
+        # get music_tokens_conds from level above
         music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
         # if there are no levels above should return None!
 
-        # set metadata offset, sample_length and lyrics okens TODO rename to lyric_cond
+        # set metadata offset, sample_length and lyrics tokens 
         metadata = prior.get_metadata(labels, start, self.config.sample_length, offset)
 
         empty_cache()

From 32949c11963f0831993b7bac4fb1de4849249313 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 9 Aug 2022 18:26:35 +0000
Subject: [PATCH 084/196] style

---
 .../models/jukebox/configuration_jukebox.py   |   2 +-
 .../models/jukebox/convert_jukebox.py         | 104 +++---
 .../models/jukebox/modeling_jukebox.py        | 326 ++++++++++++------
 3 files changed, 256 insertions(+), 176 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index bab46ce84dc7e..13e10e35e7be2 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -176,7 +176,7 @@ def __init__(
         cond_zero_out=False,
         # args for the priors, 3 priors
         n_ctx=(8192, 8192, 8192),
-        t_bins=128, # TODO rename to timing_embed_dim
+        t_bins=128,  # TODO rename to timing_embed_dim
         downs_t=(3, 2, 2),
         strides_t=(2, 2, 2),
         single_enc_dec=[True, False, False],
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 281c48768df24..8e39a07e7db07 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -16,6 +16,7 @@
 
 
 import argparse
+import os
 from pathlib import Path
 
 import torch
@@ -23,7 +24,7 @@
 import requests
 from transformers import JukeboxConfig, JukeboxModel
 from transformers.utils import logging
-import os
+
 
 logging.set_verbosity_info()
 logger = logging.get_logger(__name__)
@@ -54,33 +55,35 @@ def rename_key(dct, old, new):
     ],
 }
 
-def replace_key(key) : 
-    if ".k." in key:   # replace vqvae.X.k with vqvae.X.codebook
+
+def replace_key(key):
+    if ".k." in key:  # replace vqvae.X.k with vqvae.X.codebook
         return key.replace(".k.", ".codebook.")
     elif ".y_emb." in key:
-            key = key.replace(".y_emb.", ".metadata_embedding.")
-      
-          
-# TODO right a clean conversion code using regex or replace 
-# depending on the most appropriate choice            
+        key = key.replace(".y_emb.", ".metadata_embedding.")
+
+
+# TODO right a clean conversion code using regex or replace
+# depending on the most appropriate choice
 def fix_jukebox_keys(state_dict, model_state_dict):
     new_dict = {}
-    model_unformatted_keys = {".".join(k.split('.')[2:]) for k in model_state_dict.keys()}
-    import re 
-    model_to_conv = {1:"conv1d_1", 3:"conv1d_2"}
+    model_unformatted_keys = {".".join(k.split(".")[2:]) for k in model_state_dict.keys()}
+    import re
+
+    model_to_conv = {1: "conv1d_1", 3: "conv1d_2"}
     re_cond_block = re.compile("conditioner_blocks.([\d]).cond.model.([\d]).([\d]).model.([\d])")
     groups = re_cond_block.match(original_key).groups()
     block_index = int(groups[0]) * 2 + int(groups[1])
     re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}.resnet_block.{model_to_conv[groups[-1]]}"
-    
-    re_cond_block.sub(re_new_key,original_key)
-    
+
+    re_cond_block.sub(re_new_key, original_key)
+
     for original_key, value in state_dict.items():
         key = original_key
-        
+
         if ".k." in key:
             key = key.replace(".k.", ".codebook.")
-        
+
         elif ".y_emb." in key:
             key = key.replace(".y_emb.", ".metadata_embedding.")
         else:
@@ -89,49 +92,23 @@ def fix_jukebox_keys(state_dict, model_state_dict):
                 if len(wo_model[1].split(".")) <= 3:
                     key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
                 else:
-                    block_index = str(
-                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2])
-                    )
-                    key = (
-                        wo_model[0]
-                        + "downsample_block."
-                        + block_index
-                        + "."
-                        + wo_model[1].split(".")[-1]
-                    )
+                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
+                    key = wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
             elif len(wo_model) == 2 and "decoders" in key:
                 if len(wo_model[1].split(".")) <= 3:
                     key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
                 else:
-                    block_index = str(
-                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                    )
-                    key = (
-                        wo_model[0]
-                        + "upsample_block."
-                        + block_index
-                        + "."
-                        + wo_model[1].split(".")[-1]
-                    )
+                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
+                    key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
             elif len(wo_model) == 2 and "cond.model." in key:
                 if len(wo_model[1].split(".")) <= 3:
                     key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
                 else:
-                    block_index = str(
-                        int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                    )
-                    key = (
-                        wo_model[0]
-                        + "upsample_block."
-                        + block_index
-                        + "."
-                        + wo_model[1].split(".")[-1]
-                    )
+                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
+                    key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
             elif len(wo_model) == 3 and "priors" in key:
                 # should also rename cond to low_lvl_conditioner
-                block_index = str(
-                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                )
+                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
                 key = (
                     wo_model[0]
                     + "upsample_block."
@@ -148,9 +125,7 @@ def fix_jukebox_keys(state_dict, model_state_dict):
                 # vqvae.decoders.0.level_blocks.0.model.1.0.model.1.model.3.bias
                 # to
                 # vqvae.decoders.1.level_blocks.0.upsample_block.1.resnet_blocks.2.conv1d_2.weight
-                block_index = str(
-                    int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2
-                )
+                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
                 key = (
                     wo_model[0]
                     + "upsample_block."
@@ -180,24 +155,24 @@ def fix_jukebox_keys(state_dict, model_state_dict):
                 key = key.replace(".model.3.bias", ".conv1d_2.bias")
             elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
                 key = key.replace(".model.3.weight", ".conv1d_2.weight")
-                
-        if ".cond." in key : 
+
+        if ".cond." in key:
             key = key.replace(".cond.", ".upsampler.")
-        if ".ln" in key : 
+        if ".ln" in key:
             key = key.replace(".ln", ".layer_norm")
-        if "_ln" in key : 
+        if "_ln" in key:
             key = key.replace("_ln", "_layer_norm")
         if "prime_prior" in key:
-            key = key.replace("prime_prior","lyric_encoder")
+            key = key.replace("prime_prior", "lyric_encoder")
         if "prime_x_out" in key:
-            key = key.replace("prime_x_out","lyric_enc_proj_out")
+            key = key.replace("prime_x_out", "lyric_enc_proj_out")
         # if "x_emb" in key:
         #     key = key.replace("x_emb","lyric_enc_proj_out")
         if not "conditioner_blocks" in key and "x_emb" in key:
-            key = key.replace("x_emb","lyric_enc.proj_out")
+            key = key.replace("x_emb", "lyric_enc.proj_out")
         if key not in model_unformatted_keys:
             print(f"failed converting {original_key} to {key}, does not match")
-        
+
         # elif value.shape != model_state_dict[key].shape:
         #     print(
         #         f"{original_key}-> {key} : \nshape {model_unformatted_keys[key].shape} and"
@@ -216,7 +191,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     for file in MODEL_MAPPING[model_name]:
         if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"):
             r = requests.get(f"{PREFIX}{file}", allow_redirects=True)
-            os.makedirs(f"{pytorch_dump_folder_path}/",exist_ok=True)
+            os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True)
             open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
 
     vqvae, *priors = MODEL_MAPPING[model_name.split("/")[-1]]
@@ -224,7 +199,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         "model"
     ]
 
-    config = JukeboxConfig.from_pretrained("ArthurZ/"+model_name)
+    config = JukeboxConfig.from_pretrained("ArthurZ/" + model_name)
     model = JukeboxModel(config)
 
     weight_dict = []
@@ -266,7 +241,10 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         help="Name of the model you'd like to convert.",
     )
     parser.add_argument(
-        "--pytorch_dump_folder_path", default="converted_model", type=str, help="Path to the output PyTorch model directory."
+        "--pytorch_dump_folder_path",
+        default="converted_model",
+        type=str,
+        help="Path to the output PyTorch model directory.",
     )
     args = parser.parse_args()
     convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 147652d8ffcfb..73eb8c58424e1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -66,7 +66,10 @@ def empty_cache():
 
 def get_range(list):
     return tqdm(
-        list, leave=True, file=sys.stdout, bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+        list,
+        leave=True,
+        file=sys.stdout,
+        bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
     )
 
 
@@ -98,7 +101,9 @@ def __init__(self, n_in, n_out, zero_out=False):
     def forward(self, hidden_states):
         size_out = (*hidden_states.size()[:-1], self.n_out)
         hidden_states = torch.addmm(
-            self.bias.type_as(hidden_states), hidden_states.view(-1, hidden_states.size(-1)), self.weight.type_as(hidden_states)
+            self.bias.type_as(hidden_states),
+            hidden_states.view(-1, hidden_states.size(-1)),
+            self.weight.type_as(hidden_states),
         )  # If hidden_states if float then float else half
         hidden_states = hidden_states.view(*size_out)
         return hidden_states
@@ -308,7 +313,7 @@ def level_block(level, down_t, stride_t):
         for level, down_t, stride_t in iterator:
             self.level_blocks.append(level_block(level, down_t, stride_t))
 
-        #TODO rename to proj out
+        # TODO rename to proj out
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
     def forward(self, hidden_states, all_levels=True):
@@ -512,7 +517,9 @@ def __init__(self, codebook_dim, codebook_width, mu, levels):
             self.level_blocks.append(JukeboxBottleneckBlock(codebook_dim, codebook_width, mu))
 
     def encode(self, raw_audio):
-        music_tokens = [level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio)]
+        music_tokens = [
+            level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio)
+        ]
         return music_tokens
 
     def decode(self, music_tokens, start_level=0, end_level=None):
@@ -675,7 +682,8 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
 
     def sample(self, n_samples):
         music_tokens = [
-            torch.randint(0, self.codebook_dim, size=(n_samples, *music_tokens_shape), device="cpu") for music_tokens_shape in self.music_tokens_shapes
+            torch.randint(0, self.codebook_dim, size=(n_samples, *music_tokens_shape), device="cpu")
+            for music_tokens_shape in self.music_tokens_shapes
         ]
         return self.decode(music_tokens)
 
@@ -741,7 +749,9 @@ def repeat(hidden_states, n_repeat, dim):
     if dim == -1:
         dim = len(hidden_states.shape) - 1
     return (
-        hidden_states.view(int(np.prod(hidden_states.shape[: dim + 1])), 1, int(np.prod(hidden_states.shape[dim + 1 :])))
+        hidden_states.view(
+            int(np.prod(hidden_states.shape[: dim + 1])), 1, int(np.prod(hidden_states.shape[dim + 1 :]))
+        )
         .repeat(1, n_repeat, 1)
         .view(*hidden_states.shape[:dim], n_repeat * hidden_states.shape[dim], *hidden_states.shape[dim + 1 :])
     )
@@ -759,7 +769,9 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
         # Masked summary
         mask = (
             torch.nn.functional.pad(
-                torch.ones(query_length, query_length, device=device).tril().view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
+                torch.ones(query_length, query_length, device=device)
+                .tril()
+                .view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
                 (0, 0, 1, 0),
                 value=1,
             )
@@ -862,7 +874,7 @@ def _attn(self, query_states, key_states, value_states, sample):
                 attention_weight = attention_weight * mask + -1e9 * (1 - mask)
         attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type)
         if self.record_attn:
-            self.attention_prob = attention_prob 
+            self.attention_prob = attention_prob
             if self.attn_func == 7:
                 # only keep music queries and lyrics keys/values
                 self.attention_prob = self.attention_prob[:, :, self.prime_len :, : self.prime_len]
@@ -985,8 +997,16 @@ def prev_block_attn(self, query, key, value, sample):
                 qb = query_length // block_ctx
                 kb = seq_len // block_ctx
                 seq_len = query_length
-                key = key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, embed_dim)
-                value = value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:].contiguous().view(batch_size * qb, block_ctx, embed_dim)
+                key = (
+                    key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
+                    .contiguous()
+                    .view(batch_size * qb, block_ctx, embed_dim)
+                )
+                value = (
+                    value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
+                    .contiguous()
+                    .view(batch_size * qb, block_ctx, embed_dim)
+                )
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def summary_attn(self, query, key, value, sample):
@@ -997,7 +1017,9 @@ def summary_attn(self, query, key, value, sample):
         batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
         if sample:
             key = torch.nn.functional.pad(key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
-            value = torch.nn.functional.pad(value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
+            value = torch.nn.functional.pad(
+                value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0)
+            )
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             key = torch.nn.functional.pad(
@@ -1022,12 +1044,17 @@ def summary_spread_attn(self, query, key, value, sample):
             # return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             key = (
-                torch.nn.functional.pad(key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                torch.nn.functional.pad(
+                    key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0)
+                )
                 .contiguous()
                 .view(batch_size, blocks * spread, embed_dim)
             )  # batch_size, blocks * spread, embed_dim
             value = (
-                torch.nn.functional.pad(value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0))
+                torch.nn.functional.pad(
+                    value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :],
+                    (0, 0, 0, 0, 1, 0),
+                )
                 .contiguous()
                 .view(batch_size, blocks * spread, embed_dim)
             )  # batch_size, blocks * spread, embed_dim
@@ -1084,7 +1111,9 @@ def decode_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         query = hidden_states
         if sample:
             if self.sample_t == 0:
-                self.cache["key"], self.cache["value"] = self.c_enc_kv(lyric_encoder_states.type_as(hidden_states)).chunk(2, dim=2)
+                self.cache["key"], self.cache["value"] = self.c_enc_kv(
+                    lyric_encoder_states.type_as(hidden_states)
+                ).chunk(2, dim=2)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         else:
@@ -1360,15 +1389,15 @@ def attn_block(d):
         self._attn_mods = nn.ModuleList()
         for d in range(n_depth):
             self._attn_mods.append(attn_block(d))
-            
+
         self.saved_attn_weights = []
 
     def set_record_attn(self, record_attn):
         """
         Arguments:
             record_attn (bool or set): Makes forward prop dump self-attention
-                softmaxes to self.saved_attn_weights. Either a set of layer indices indicating which layers to store, or a boolean
-                value indicating whether to dump all.
+                softmaxes to self.saved_attn_weights. Either a set of layer indices indicating which layers to store,
+                or a boolean value indicating whether to dump all.
         """
 
         def _should_record_attn(layer_idx):
@@ -1393,7 +1422,7 @@ def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=F
 
         # Blocks
         for i, attn_layer in enumerate(self._attn_mods):
-            if attn_layer.attn_func == 6: # attend to the lyrics
+            if attn_layer.attn_func == 6:  # attend to the lyrics
                 hidden_states = attn_layer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=sample)
             else:
                 hidden_states = attn_layer(hidden_states, lyric_encoder_states=None, sample=sample)
@@ -1461,19 +1490,21 @@ def __init__(
         prime_len=None,
     ):
         """
-            - input_shape : respective dimension of the different inputs (lyrics/music_tokens) 
-            - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
-            dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder 
-            for the lyric tokens.
-            - encoder_dims : input dimension of the lyric encoder. 
-            - audio_conditioning : whether or not the prior supports conditionning on audio. 
-            - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and timing. When 
-            False, the start token is random.  
+        - input_shape : respective dimension of the different inputs (lyrics/music_tokens)
+        - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
+        dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
+        for the lyric tokens.
+        - encoder_dims : input dimension of the lyric encoder.
+        - audio_conditioning : whether or not the prior supports conditionning on audio.
+        - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
+          timing. When
+        False, the start token is random.
+        - prime_len : for now ??????
         """
         super().__init__()
         self.input_shape = input_shape
         self.input_dims = input_dims = np.prod(input_shape)
-        self.encoder_dims = encoder_dims 
+        self.encoder_dims = encoder_dims
         self.embed_dim = embed_dim
         self.width = width
         self.depth = depth
@@ -1561,7 +1592,7 @@ def forward(
         get_sep_loss=False,
     ):
         """
-            - tokens : composed of both music tokens and lyrics tokens or just music tokens 
+        - tokens : composed of both music tokens and lyrics tokens or just music tokens
         """
         # Preprocess.
         with torch.no_grad():
@@ -1579,9 +1610,13 @@ def forward(
         else:
             hidden_states[:, 0] = self.start_token
 
-        hidden_states = self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning  # Pos emb and dropout
+        hidden_states = (
+            self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning
+        )  # Pos emb and dropout
 
-        hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16)  # Transformer
+        hidden_states = self.transformer(
+            hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16
+        )  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
             hidden_states = hidden_states + audio_conditioning
 
@@ -1624,7 +1659,9 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
             cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
             cond = audio_conditioning
-        hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond  # Pos emb, dropout is identity at eval time
+        hidden_states = (
+            hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond
+        )  # Pos emb, dropout is identity at eval time
         return hidden_states, cond
 
     def sample(
@@ -1655,7 +1692,9 @@ def sample(
                 preds = []
 
             for sample_t in get_range(range(0, sample_tokens)):
-                hidden_states, cond = self.get_emb(sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning)
+                hidden_states, cond = self.get_emb(
+                    sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
+                )
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
                     hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16
@@ -1668,7 +1707,9 @@ def sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                tokens = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace hidden_states
+                tokens = torch.distributions.Categorical(
+                    logits=hidden_states
+                ).sample()  # Sample and replace hidden_states
                 sampled_tokens.append(tokens.clone())
             del tokens
             self.transformer.del_cache()
@@ -1729,7 +1770,9 @@ def primed_sample(
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     # TODO rename x_prime, con_prime
-                    x_prime, cond_prime = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
+                    x_prime, cond_prime = self.get_emb(
+                        sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
+                    )
                     hidden_states = sampled_audio[sample_t]
                     sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
@@ -1762,9 +1805,13 @@ def primed_sample(
             empty_cache()
 
             for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
-                hidden_states, cond = self.get_emb(sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning)
+                hidden_states, cond = self.get_emb(
+                    sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
+                )
                 self.transformer.check_cache(n_samples, sample_t, fp16)
-                hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16)  # Transformer
+                hidden_states = self.transformer(
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16
+                )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
@@ -1773,7 +1820,9 @@ def primed_sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                hidden_states = torch.distributions.Categorical(logits=hidden_states).sample()  # Sample and replace hidden_states
+                hidden_states = torch.distributions.Categorical(
+                    logits=hidden_states
+                ).sample()  # Sample and replace hidden_states
                 assert hidden_states.shape == (n_samples, 1)
                 sampled_audio.append(hidden_states.clone())
 
@@ -1850,13 +1899,14 @@ class MusicTokenConditioner(nn.Module):
     The embedding layer is different from the vaqvae's bottleneck
 
     """
-    # TODO check why embed_dim is initialized to config.latent_dim which is 2048 = to codebook_di. is it 
-    # latent_dim? 
+
+    # TODO check why embed_dim is initialized to config.latent_dim which is 2048 = to codebook_di. is it
+    # latent_dim?
     def __init__(
         self, input_shape, embed_dim, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
     ):
         super().__init__()
-        # self.x_shape = input_shape # is this needed? 
+        # self.x_shape = input_shape # is this needed?
         self.width = out_width
         self.embed_tokens = nn.Embedding(embed_dim, out_width)
         nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
@@ -1962,7 +2012,9 @@ def forward(self, pos_start, pos_end=None):
 
         # Bin each value to bins_
         normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)  # [0,1)
-        bins_ = (self.embed_dim * normalised_position).floor().long().detach()  # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1]
+        bins_ = (
+            (self.embed_dim * normalised_position).floor().long().detach()
+        )  # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1]
         return self.emb(bins_)
 
 
@@ -2004,7 +2056,13 @@ def __init__(
             )
 
     def forward(self, metadata):
-        total_length, offset, length, artist, genre = metadata[:, 0:1], metadata[:, 1:2], metadata[:, 2:3], metadata[:, 3:4], metadata[:, 4:]
+        total_length, offset, length, artist, genre = (
+            metadata[:, 0:1],
+            metadata[:, 1:2],
+            metadata[:, 2:3],
+            metadata[:, 3:4],
+            metadata[:, 4:],
+        )
         # Start embedding of length 1
         artist_emb = self.artist_emb(artist)
         # Empty genre slots are denoted by -1. We mask these out.
@@ -2027,7 +2085,6 @@ def forward(self, metadata):
         return start_emb, pos_emb
 
 
-
 class JukeboxPrior(nn.Module):
     """
     Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
@@ -2039,20 +2096,22 @@ class JukeboxPrior(nn.Module):
     - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
       vocab
     and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
-    which we autoregressively model together. # TODO this explains the input bins that are different from the lower lvl transformers. 
-    
-    Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in the primed sample
-    or sample functions. If the model is not trained using these/ uses the forward differently then I guess it is fine but otherwise it looks strange.
+    which we autoregressively model together. # TODO this explains the input bins that are different from the lower lvl
+    transformers.
+
+    Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in
+    the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
+    guess it is fine but otherwise it looks strange.
     """
 
-    def __init__(self, config, level, encoder = None, decoder = None):
+    def __init__(self, config, level, encoder=None, decoder=None):
         super().__init__()
-        
-        # Passing functions instead of the vqvae module to avoid getting params, only used in the 
-        # forward loop 
+
+        # Passing functions instead of the vqvae module to avoid getting params, only used in the
+        # forward loop
         self.encoder = encoder
         self.decoder = decoder
-        
+
         vqvae_music_tokens_shapes = config.vqvae_music_tokens_shapes
 
         def rescale(music_tokens_shape):
@@ -2061,8 +2120,8 @@ def rescale(music_tokens_shape):
         music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
         self.use_tokens = config.use_tokens[-level - 1]
         self.n_tokens = config.n_tokens[-level - 1]
-        
-        # TODO rename prime loss fraction 
+
+        # TODO rename prime loss fraction
         self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
 
         self.copy_input = config.copy_input
@@ -2075,7 +2134,7 @@ def rescale(music_tokens_shape):
         self.music_tokens_shape = self.music_tokens_shapes[level]
 
         self.level = level
-        
+
         self.latent_dim = config.latent_dim
 
         prior_kwargs = dict(
@@ -2083,7 +2142,7 @@ def rescale(music_tokens_shape):
             embed_dim=config.latent_dim,
             width=config.width[-level - 1],
             depth=config.depth[-level - 1],
-            heads=config.n_heads[-level - 1], # TODO Rename in config
+            heads=config.n_heads[-level - 1],  # TODO Rename in config
             attn_order=config.attn_order[-level - 1],
             blocks=config.blocks,
             spread=config.spread,
@@ -2098,7 +2157,7 @@ def rescale(music_tokens_shape):
         )
 
         if config.use_tokens and not config.single_enc_dec[-level - 1]:
-            # TODO rename to encoder_kwargs as they are used both 
+            # TODO rename to encoder_kwargs as they are used both
             # when single and not
             lyric_enc_kwargs = dict(
                 n_vocab=config.n_vocab,
@@ -2137,8 +2196,8 @@ def rescale(music_tokens_shape):
         metadata_conditioning_kwargs = dict(
             out_width=config.width[-level - 1],
             init_scale=config.init_scale[-level - 1],
-            metadata_dims=config.y_bins[-level - 1], # rename to metadata_dims
-            timing_dims=config.t_bins, # rename to timing_dims or timing_intervals
+            metadata_dims=config.y_bins[-level - 1],  # rename to metadata_dims
+            timing_dims=config.t_bins,  # rename to timing_dims or timing_intervals
             sr=config.sr,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
@@ -2150,7 +2209,7 @@ def rescale(music_tokens_shape):
         self.cond_level = level + 1
 
         # metadata conditioning
-        self.metadata_conditioning = config.labels # TODO change config
+        self.metadata_conditioning = config.labels  # TODO change config
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
         # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
@@ -2160,7 +2219,7 @@ def rescale(music_tokens_shape):
             def conditioner_block(_level):
                 return MusicTokenConditioner(
                     input_shape=music_tokens_shapes[_level],
-                    embed_dim=config.latent_dim,        # TODO should we remove in favor of the vqvae dim? Maybe not
+                    embed_dim=config.latent_dim,  # TODO should we remove in favor of the vqvae dim? Maybe not
                     down_t=config.downs_t[_level],
                     stride_t=config.strides_t[_level],
                     **audio_conditioning_kwargs,
@@ -2172,7 +2231,9 @@ def conditioner_block(_level):
         # metadata conditioning : contioning on timing, genres, and artist
         if self.metadata_conditioning:
             self.n_time = self.music_tokens_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
-            self.metadata_embedding = LabelConditioner(n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs)
+            self.metadata_embedding = LabelConditioner(
+                n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs
+            )
 
         # TODO as the prior type can change, can't rename to decoder or enc_dec
         if config.single_enc_dec[-level - 1]:
@@ -2182,7 +2243,7 @@ def conditioner_block(_level):
             self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
             self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
             self.prior_width = prior_kwargs["width"]
-            
+
             # print(f"Creating cond. autoregress with prior embed_dim {self.prior_embed_dim}, ")
             # print(f"dims {self.prior_dims}, ")
             # print(f"shift {self.prior_embed_dim_shift}")
@@ -2190,9 +2251,9 @@ def conditioner_block(_level):
             # print(f"input embed_dim (vocab size of the embedding layer) {sum(self.prior_embed_dim)}")
             # print(f"Self copy is {self.copy_input}")
 
-            # lyrics_enc_loss_dims was the prime loss dims, gen is for the generated tokens. 
-            # what is the shape of the lyrics loss? 
-            
+            # lyrics_enc_loss_dims was the prime loss dims, gen is for the generated tokens.
+            # what is the shape of the lyrics loss?
+
             self.lyrics_enc_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
             self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
             self.prior = JukeboxConditionalAutoregressive(
@@ -2211,7 +2272,11 @@ def conditioner_block(_level):
                 self.lyrics_enc_loss_dims = np.prod(lyric_enc_input_shape)
                 self.lyric_acts_width, self.lyric_enc_width = lyric_enc_kwargs["width"], prior_kwargs["width"]
                 self.lyric_encoder = JukeboxConditionalAutoregressive(
-                    input_shape=lyric_enc_input_shape, audio_conditioning=False, metadata_conditioning=False, only_encode=True, **lyric_enc_kwargs
+                    input_shape=lyric_enc_input_shape,
+                    audio_conditioning=False,
+                    metadata_conditioning=False,
+                    only_encode=True,
+                    **lyric_enc_kwargs,
                 )
                 self.lyric_encoder_proj_out = JukeboxConv1D(self.lyric_acts_width, self.lyric_enc_width)
                 self.lyric_encoder_layer_norm = JukeboxLayerNorm(self.lyric_enc_width)
@@ -2222,7 +2287,7 @@ def conditioner_block(_level):
                 self.lyrics_enc_loss_dims = 0
             self.gen_loss_dims = np.prod(self.music_tokens_shape)
             self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
-            
+
             # prior on the tokens
             self.prior = JukeboxConditionalAutoregressive(
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
@@ -2262,7 +2327,7 @@ def get_metadata(self, labels, start, total_length, offset, get_indices=False):
 
     def set_metadata_lyric_tokens(self, labels):
         """
-            Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens. 
+        Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
         """
         if self.n_tokens > 0:
             tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
@@ -2274,17 +2339,22 @@ def set_metadata_lyric_tokens(self, labels):
                 tokens_list[idx, :] = tokens
                 indices_list.append(indices)
 
-            return torch.cat((labels[:, : 4 + self.metadata_embedding.max_bow_genre_size], tokens_list), dim=-1), indices_list
+            return (
+                torch.cat((labels[:, : 4 + self.metadata_embedding.max_bow_genre_size], tokens_list), dim=-1),
+                indices_list,
+            )
         else:
             return labels, None
 
     def get_music_tokens_conds(self, music_tokens, start, end):
         """
-            Extracts current level's conditioning music tokens. 
+        Extracts current level's conditioning music tokens.
         """
         if self.level != self.levels - 1:
             assert start % self.cond_downsample == end % self.cond_downsample == 0
-            music_tokens_cond = music_tokens[self.level + 1][:, start // self.cond_downsample : end // self.cond_downsample]
+            music_tokens_cond = music_tokens[self.level + 1][
+                :, start // self.cond_downsample : end // self.cond_downsample
+            ]
             assert music_tokens_cond.shape[1] == self.n_ctx // self.cond_downsample
             music_tokens_conds = [music_tokens_cond]
         else:
@@ -2293,9 +2363,8 @@ def get_music_tokens_conds(self, music_tokens, start, end):
 
     def prior_preprocess(self, tokens, conds):
         """
-            Shifts the input tokens to account for the dictionnary merge. 
-            The prior_embed_dim_shift give by how much. the music tokens should be
-            shifted by + nb_vocab.
+        Shifts the input tokens to account for the dictionnary merge. The prior_embed_dim_shift give by how much. the
+        music tokens should be shifted by + nb_vocab.
         """
         N = tokens[0].shape[0]
         for i in range(len(tokens)):
@@ -2310,9 +2379,9 @@ def prior_preprocess(self, tokens, conds):
 
     def prior_postprocess(self, tokens):
         """
-        Shifts back the input tokens if the model is uses an encoder decoder architecture. 
-        As the embedding layer is shared, prior_embed_dim_shift shifts the music token ids by
-         - nb_vocab. 
+        Shifts back the input tokens if the model is uses an encoder decoder architecture. As the embedding layer is
+        shared, prior_embed_dim_shift shifts the music token ids by
+         - nb_vocab.
         Returns : only returns the music tokens
         """
         N = tokens.shape[0]
@@ -2330,10 +2399,10 @@ def prior_postprocess(self, tokens):
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
 
         return tokens[-1]
-    
+
     def embed_tokens(self, music_tokens_conds):
         """
-            Embeds the upper level music tokens and upsamples them to provide as audio conditioning. 
+        Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
         """
         music_tokens_conds = music_tokens_conds[: self.cond_level - self.level]
         audio_conditioning = None
@@ -2344,7 +2413,7 @@ def embed_tokens(self, music_tokens_conds):
     # Used in the forward pass
     def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
         """
-            Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states. 
+        Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
         """
         if start_level is None:
             start_level = self.level
@@ -2352,13 +2421,14 @@ def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
             end_level = self.levels
         # Get latents
         with torch.no_grad():
-            latent_states = self.encoder(hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+            latent_states = self.encoder(
+                hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+            )
         return latent_states
 
-    
     def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
         """
-            Usamples the sequence of codebook vectors  to a raw audio.  
+        Usamples the sequence of codebook vectors to a raw audio.
         """
         if start_level is None:
             start_level = self.level
@@ -2370,14 +2440,16 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
 
     def get_cond(self, music_tokens_conds, metadata):
         """
-            Converts the tokens to the input_embeddings. Splits the lyrics and the metadata. Lyric tokens can be None
+        Converts the tokens to the input_embeddings. Splits the lyrics and the metadata. Lyric tokens can be None
         """
         if metadata is not None:
             n_labels = metadata.shape[1] - self.n_tokens
             metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:]
         else:
             metadata, lyric_tokens = None, None
-        metadata_conditioning, metadata_pos = self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
+        metadata_conditioning, metadata_pos = (
+            self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
+        )
         audio_conditioning = self.embed_tokens(music_tokens_conds) if self.audio_conditioning else metadata_pos
         return audio_conditioning, metadata_conditioning, lyric_tokens
 
@@ -2395,7 +2467,7 @@ def sample(
         sample_tokens=None,
     ):
         """
-            Ancestral sampling a window of tokens using the provided conditioning and metadatas
+        Ancestral sampling a window of tokens using the provided conditioning and metadatas
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
@@ -2406,9 +2478,13 @@ def sample(
             audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
             if self.single_enc_dec:
                 if no_past_context:
-                    music_tokens, audio_conditioning = self.prior_preprocess([lyric_tokens], [None, audio_conditioning])
+                    music_tokens, audio_conditioning = self.prior_preprocess(
+                        [lyric_tokens], [None, audio_conditioning]
+                    )
                 else:
-                    music_tokens, audio_conditioning = self.prior_preprocess([lyric_tokens, music_tokens], [None, audio_conditioning])
+                    music_tokens, audio_conditioning = self.prior_preprocess(
+                        [lyric_tokens, music_tokens], [None, audio_conditioning]
+                    )
                 if sample_tokens is not None:
                     sample_tokens += self.n_tokens
                 tokens = self.prior.primed_sample(
@@ -2456,8 +2532,8 @@ def sample(
 
     def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
         """
-            Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. 
-            Forwards through the lyric encoder. 
+        Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
+        the lyric encoder.
         """
         if self.n_tokens != 0 and self.use_tokens:
             if sample:
@@ -2475,43 +2551,54 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
 
     def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
         """
-            Computes the loss for the lyric encoder, next token prediction.
+        Computes the loss for the lyric encoder, next token prediction.
         """
         if self.use_tokens:
             lyric_encoder_states = lyric_encoder_states.float()
             lyric_encoder_states = self.lyric_encoder.proj_out(lyric_encoder_states)
-            lyric_enc_loss = nn.functional.cross_entropy(lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)) / np.log(
-                2.0
-            )
+            lyric_enc_loss = nn.functional.cross_entropy(
+                lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)
+            ) / np.log(2.0)
         else:
             lyric_enc_loss = torch.tensor(0.0, device="cuda")
         return lyric_enc_loss
 
-    def forward_tokens(self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False):
+    def forward_tokens(
+        self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False
+    ):
         """
-        Applies a forward pass using the conditioning tokens. Different from the classif forward as 
-        it does not use the vqvae's encoding layers. 
-            
+        Applies a forward pass using the conditioning tokens. Different from the classif forward as it does not use the
+        vqvae's encoding layers.
+
         Args:
             get_attn_weights (bool or set): Makes forward prop dump
-                self-attention softmaxes to self.prior.transformer.saved_attn_weights. Either a set of layer indices indicating which
-                layers to store, or a boolean value indicating whether to dump all.
+                self-attention softmaxes to self.prior.transformer.saved_attn_weights. Either a set of layer indices
+                indicating which layers to store, or a boolean value indicating whether to dump all.
         """
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
         audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
         if self.copy_input:
             lyric_tokens = music_tokens[:, : self.n_tokens]
-            
-        if self.single_enc_dec: # the preprocess returns the full tokens, shifted
-            tokens, audio_conditioning = self.prior_preprocess([lyric_tokens, music_tokens], [None, audio_conditioning])
+
+        if self.single_enc_dec:  # the preprocess returns the full tokens, shifted
+            tokens, audio_conditioning = self.prior_preprocess(
+                [lyric_tokens, music_tokens], [None, audio_conditioning]
+            )
             (prime_loss, gen_loss), preds = self.prior(
                 tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
             )
         else:
             lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16)
             prime_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
-            gen_loss, preds = self.prior(music_tokens, audio_conditioning, metadata_conditioning, lyric_encoder_states, fp16=fp16, get_preds=get_preds)
+            gen_loss, preds = self.prior(
+                music_tokens,
+                audio_conditioning,
+                metadata_conditioning,
+                lyric_encoder_states,
+                fp16=fp16,
+                get_preds=get_preds,
+            )
         loss = (self.prime_loss_fraction * prime_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
             gen_loss * self.gen_loss_dims / self.total_loss_dims
         )
@@ -2530,7 +2617,13 @@ def forward_tokens(self, music_tokens, music_tokens_conds=[], metadata=None, fp1
     def forward(self, hidden_states, metadata=None, fp16=False, decode=False, get_preds=False):
         batch_size = hidden_states.shape[0]
         music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
-        loss, metrics = self.forward_tokens(music_tokens=music_tokens, music_tokens_conds=music_tokens_conds, metadata= metadata, fp16=fp16, get_preds=get_preds)
+        loss, metrics = self.forward_tokens(
+            music_tokens=music_tokens,
+            music_tokens_conds=music_tokens_conds,
+            metadata=metadata,
+            fp16=fp16,
+            get_preds=get_preds,
+        )
         if decode:
             dequantised_states = self.decode([music_tokens, *music_tokens_conds])
         else:
@@ -2605,7 +2698,7 @@ def get_starts(total_length, n_ctx, hop_length):
 # FIXME, consumes too much RAM so should probably be removed
 def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     """
-    Should compute the lyric to music token alignment, but for now it cannot be used. 
+    Should compute the lyric to music token alignment, but for now it cannot be used.
     """
     level = level - 1  # Top level used
     n_ctx = prior.n_ctx
@@ -2637,7 +2730,9 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
-            w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers)
+            w_hop = prior.forward_tokens(
+                tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
+            )
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -2772,7 +2867,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
         # if there are no levels above should return None!
 
-        # set metadata offset, sample_length and lyrics tokens 
+        # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.config.sample_length, offset)
 
         empty_cache()
@@ -2783,8 +2878,16 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         music_tokens_conds_list = split_batch(music_tokens_conds, n_samples, max_batch_size)
         metadata_list = split_batch(metadata, n_samples, max_batch_size)
         tokens = []
-        for music_tokens_i, music_tokens_conds_i, metadata_i in zip(music_tokens_list, music_tokens_conds_list, metadata_list):
-            tokens_i = prior.sample(n_samples=music_tokens_i.shape[0], music_tokens=music_tokens_i, music_tokens_conds=music_tokens_conds_i, metadata= metadata_i, **sampling_kwargs)
+        for music_tokens_i, music_tokens_conds_i, metadata_i in zip(
+            music_tokens_list, music_tokens_conds_list, metadata_list
+        ):
+            tokens_i = prior.sample(
+                n_samples=music_tokens_i.shape[0],
+                music_tokens=music_tokens_i,
+                music_tokens_conds=music_tokens_conds_i,
+                metadata=metadata_i,
+                **sampling_kwargs,
+            )
             tokens.append(tokens_i)
         sampled_tokens = torch.cat(tokens, dim=0)
 
@@ -2885,7 +2988,6 @@ def _sample(
                 logdir = f"{self.start_time}/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                # torch.save(dict(music_tokens=music_tokens, labels=labels, sampling_kwargs=sampling_kwargs, raw_audio=raw_audio), f"{logdir}/data.pth.tar")
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
                     empty_cache()

From 70e9191b0a87822057f963097bedcfdeabd5d501 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 14:29:40 +0000
Subject: [PATCH 085/196] major renameing and cleaning parameters from config
 file

---
 .../models/jukebox/configuration_jukebox.py   | 385 ++++++------------
 .../models/jukebox/convert_jukebox.py         | 313 +++++++-------
 .../models/jukebox/modeling_jukebox.py        | 200 +++++----
 3 files changed, 394 insertions(+), 504 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 13e10e35e7be2..849996510ec1b 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -22,7 +22,7 @@
 logger = logging.get_logger(__name__)
 
 JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "ArthurZ/jukebox-dummy": "https://huggingface.co/ArthurZ/jukebox-dummy/blob/main/config.json",
+    "ArthurZ/jukebox-5b-lyrics": "https://huggingface.co/ArthurZ/jukebox-5b-lyrics/blob/main/config.json",
     "ArthurZ/jukebox-1b-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/blob/main/config.json",
 }
 
@@ -33,7 +33,7 @@ class JukeboxConfig(PretrainedConfig):
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
-    yield a similar configuration to that of the Speech2Text
+    yield a similar configuration to that of
     [ArthurZ/jukebox-1b-lyrics](https://huggingface.co/ArthurZ/jukebox-1b-lyrics) architecture.
 
 
@@ -42,41 +42,7 @@ class JukeboxConfig(PretrainedConfig):
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 50257):
-            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`JukeboxModel`]].
-        n_positions (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        n_embd (`int`, *optional*, defaults to 768):
-            Dimensionality of the embeddings and hidden states.
-        n_layer (`int`, *optional*, defaults to 12):
-            Number of hidden layers in the Transformer encoder.
-        n_head (`int`, *optional*, defaults to 12):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        n_inner (`int`, *optional*, defaults to None):
-            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
-        activation_function (`str`, *optional*, defaults to `"gelu"`):
-            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
-        resid_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (`int`, *optional*, defaults to 0.1):
-            The dropout ratio for the embeddings.
-        attn_dropout (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the attention.
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
-            The epsilon to use in the layer normalization layers.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        scale_attn_weights (`bool`, *optional*, defaults to `True`):
-            Scale attention weights by dividing by sqrt(hidden_size)..
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
-            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
-        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
-            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
-            dot-product/softmax to float() when training with mixed precision.
+
 
     Example:
 
@@ -102,112 +68,43 @@ class JukeboxConfig(PretrainedConfig):
         "num_attention_heads": "n_head",
         "num_hidden_layers": "n_layer",
     }
-    # params are given for the `n` priors at the same time which means that you have
-    # level2,level1,level0
 
     def __init__(
         self,
-        vocab_size=50257,
-        n_positions=1024,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        n_inner=None,
-        emb_dropout=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        summary_type="cls_index",
-        summary_use_proj=True,
-        summary_activation=None,
-        summary_proj_to_labels=True,
-        summary_first_dropout=0.1,
-        scale_attn_weights=True,
-        use_cache=True,
-        bos_token_id=50256,
-        eos_token_id=50256,
-        scale_attn_by_inverse_layer_idx=False,
-        reorder_and_upcast_attn=False,
-        # Global paranmeters
-        sr=16000,
-        sample_length=None,
-        sample_length_in_seconds=1,
-        y_bins=[(120, 4111), (120, 4111), (120, 4111)],
-        use_nonrelative_specloss=True,
+        sampling_rate=44100,
+        metadata_dims=[(604, 7898), (120, 4111), (120, 4111)],
         copy_input=False,
-        resid_dropout=0.0,
-        # MLP parameters
-        mlp_init_scale=0.02,
-        # Attention layer parameters
-        attn_dropout=0.0,
-        attn_init_scale=1.0,
-        # transformer parameters
-        activation_function="gelu_new",
-        sample_hop_length=30000,
-        hop_length=256,
-        multispec_loss_n_fft=(2048, 1024, 512),
-        multispec_loss_hop_length=(240, 120, 50),
-        multispec_loss_window_size=(1200, 600, 240),
-        vq_vae_levels=3,
-        vq_vae_downs_t=(3, 2, 2),
-        vq_vae_strides_t=(2, 2, 2),
-        vq_vae_emmbedding_width=2048,
-        vq_vae_codebook_dimension=2048,
-        vq_vae_width=64,
-        vq_vae_depth=4,
-        vq_vae_m_conv=1,
-        vq_vae_dilation_growth_rate=3,
-        vq_vae_dilation_cycle=None,
-        vq_vae_multipliers=(2, 1, 1),
-        vq_vae_lmu=0.99,  # for the ema?
-        vq_vae_commit=0.02,
-        vq_vae_conv_block_depth=4,
-        vq_vae_conv_block_width=64,
-        spectral=0.0,
-        multispectral=1.0,
-        # vq_vae_loss_fn = 'l1',
-        vq_vae_reverse_decoder_dilation=1,
-        # parameters always false/useless at inference
         nb_priors=3,
-        spread=None,
-        prime_spread=None,
-        zero_out=False,
-        res_scale=False,
-        pos_init=False,
-        cond_zero_out=False,
-        # args for the priors, 3 priors
-        n_ctx=(8192, 8192, 8192),
-        t_bins=128,  # TODO rename to timing_embed_dim
-        downs_t=(3, 2, 2),
-        strides_t=(2, 2, 2),
+        timing_dims=128,
         single_enc_dec=[True, False, False],
-        labels=False,
+        metadata_conditioning=True,
         merged_decoder=[True, False, False],
-        priors_width=[4096, 2048, 1024],
-        latent_dim=2048,
-        width=[4800, 1920, 128],
-        depth=[79, 72, 72],
-        n_heads=[8, 1, 1],
-        use_tokens=[True, False, False],
-        n_tokens=[512, 0, 0],
-        attn_order=[10, 2, 2],
-        blocks=16,
-        c_res=1,
-        init_scale=[0.7, 1, 1],
+        lyric_conditioning=[True, False, False],
+        nb_relevant_lyric_tokens=[384, 0, 0],
+        min_duration=17.84,
+        max_duration=600.0,
+        fp16_params=True,
+        max_nb_genres=5,
+        init_std=0.2,
+        hop_fraction=[0.125, 0.5, 0.5],
+        cond_zero_out=False,
         cond_depth=[3, 16, 16],
         cond_width=[128, 1024, 1024],
         cond_dilation_growth_rate=[1, 3, 3],
         cond_dilation_cycle=[None, 8, 8],
         cond_c_res=[0, 1, 1],
         cond_res_scale=[None, True, False],
+        cond_m_conv=1,
+        cond_downs_t=(3, 2, 2),
+        cond_strides_t=(2, 2, 2),
+        prime_spread=None,
         prime_width=[128, 128, 128],
         prime_depth=[18, 3, 3],
-        prime_cond_c_res=[0, 1, 1],
         prime_heads=4,
         prime_m_attn=0.25,
         prime_m_mlp=1.0,
         prime_blocks=32,
         prime_init_scale=[0.1, 0.4, 0.4],
-        prime_c_res=1,
         prime_loss_fraction=[0.4, 0.0, 0.0],
         prime_attn_order=[2, 0, 0],
         prime_attn_dropout=0.0,
@@ -216,146 +113,132 @@ def __init__(
         prime_zero_out=False,
         prime_res_scale=False,
         prime_pos_init=False,
-        min_duration=1,
-        max_duration=600.0,
-        fp16_params=True,
-        alignment_layer=[68, None, None],
-        alignment_head=[2, None, None],
-        m_attn=0.25,
-        n_vocab=80,
-        cond_m_conv=1,
-        max_bow_genre_size=1,  # TODO this should only be in the tokenizer
-        name="AudioSamples",
-        init_std=0.2,
+        prime_n_vocab=79,
+        prior_init_scale=[0.2, 1, 1],
+        prior_spread=None,
+        prior_zero_out=False,
+        prior_res_scale=False,
+        prior_pos_init=False,
+        prior_n_ctx=(6144, 8192, 8192),
+        prior_latent_dim=2048,
+        prior_width=[2048, 1920, 1920],
+        prior_depth=[72, 72, 72],
+        prior_n_heads=[2, 1, 1],
+        prior_attn_order=[12, 2, 2],
+        prior_blocks=64,
+        prior_alignment_layer=[68, None, None],
+        prior_alignment_head=[2, None, None],
+        prior_m_attn=0.25,
+        prior_attn_dropout=0,
+        prior_resid_dropout=0,
+        prior_emb_dropout=0,
+        # TODO rename to vqvae
+        vqvae_levels=3,
+        vqvae_downs_t=(3, 2, 2),
+        vqvae_strides_t=(2, 2, 2),
+        vqvae_emmbedding_width=64,
+        vqvae_codebook_dimension=2048,
+        vqvae_width=32,
+        vqvae_depth=4,
+        vqvae_m_conv=1,
+        vqvae_dilation_growth_rate=3,
+        vqvae_dilation_cycle=None,
+        vqvae_multipliers=(2, 1, 1),
+        vqvae_lmu=0.99,
+        vqvae_commit=0.02,
+        vqvae_conv_block_depth=4,
+        vqvae_conv_block_width=32,
+        vqvae_reverse_decoder_dilation=1,
         **kwargs,
     ):
-        self.name = name
-        self.prime_zero_out = prime_zero_out
-        self.prime_res_scale = prime_res_scale
-        self.prime_pos_init = prime_pos_init
-        self.prime_resid_dropout = prime_resid_dropout
-        self.prime_attn_dropout = prime_attn_dropout
-        self.prime_m_mlp = prime_m_mlp
-        self.prime_m_attn = prime_m_attn
-        self.prime_emb_dropout = prime_emb_dropout
-        self.prime_attn_order = prime_attn_order
-        self.vocab_size = vocab_size
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.n_inner = n_inner
-        self.activation_function = activation_function
-        self.resid_dropout = resid_dropout
-        self.emb_dropout = emb_dropout
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
-        self.scale_attn_weights = scale_attn_weights
-        self.use_cache = use_cache
-        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
-        self.reorder_and_upcast_attn = reorder_and_upcast_attn
 
-        self.max_bow_genre_size = max_bow_genre_size
-        self.cond_m_conv = cond_m_conv
-        self.n_vocab = n_vocab
-        self.sr = sr
-        self.sample_length = sample_length
-        self.sample_length_in_seconds = sample_length_in_seconds
-        self.y_bins = y_bins
-        self.use_nonrelative_specloss = use_nonrelative_specloss
+        self.fp16_params = fp16_params
+        self.init_std = init_std
         self.copy_input = copy_input
-        self.resid_dropout = resid_dropout
-        self.mlp_init_scale = mlp_init_scale
-        self.attn_dropout = attn_dropout
-        self.attn_init_scale = attn_init_scale
-
-        self.activation_function = activation_function
-        self.sample_hop_length = sample_hop_length
-        self.hop_length = hop_length
-        self.multispec_loss_n_fft = multispec_loss_n_fft
-
-        self.multispec_loss_hop_length = multispec_loss_hop_length
-
-        self.multispec_loss_window_size = multispec_loss_window_size
-
-        self.vq_vae_levels = vq_vae_levels
-        self.vq_vae_downs_t = vq_vae_downs_t
-
-        self.vq_vae_strides_t = vq_vae_strides_t
-
-        self.vq_vae_emmbedding_width = vq_vae_emmbedding_width
-        self.vq_vae_codebook_dimension = vq_vae_codebook_dimension
-        self.vq_vae_width = vq_vae_width
-        self.vq_vae_depth = vq_vae_depth
-        self.vq_vae_m_conv = vq_vae_m_conv
-        self.vq_vae_dilation_growth_rate = vq_vae_dilation_growth_rate
-        self.vq_vae_dilation_cycle = vq_vae_dilation_cycle
-        self.vq_vae_multipliers = vq_vae_multipliers
-
-        self.vq_vae_lmu = vq_vae_lmu
-
-        self.vq_vae_commit = vq_vae_commit
-        # self.spectral = spectral
-        # self.multispectral = multispectral
-
-        self.vq_vae_conv_block_depth = vq_vae_conv_block_depth
-        self.vq_vae_conv_block_width = vq_vae_conv_block_width
-        self.vq_vae_reverse_decoder_dilation = vq_vae_reverse_decoder_dilation
-
         self.nb_priors = nb_priors
-        self.spread = spread
-        self.prime_spread = prime_spread
-        self.zero_out = zero_out
-        self.res_scale = res_scale
-        self.pos_init = pos_init
-        self.cond_zero_out = cond_zero_out
-        self.n_ctx = n_ctx
-        self.t_bins = t_bins
-        self.latent_dim = latent_dim
-        self.downs_t = downs_t
-        self.strides_t = strides_t
-        self.single_enc_dec = single_enc_dec
-        self.labels = labels
-        self.merged_decoder = merged_decoder
-        self.priors_width = priors_width
-        self.width = width
-        self.depth = depth
-        self.n_heads = n_heads
-        self.use_tokens = use_tokens
-        self.n_tokens = n_tokens
-        self.attn_order = attn_order
-        self.blocks = blocks
-        self.c_res = c_res
-        self.init_scale = init_scale
-        self.prime_width = prime_width
-        self.prime_depth = prime_depth
+        self.hop_fraction = hop_fraction
+
+        #  Auto regressive (decoder) kwargs :
+        self.prior_attn_order = prior_attn_order
+        self.prior_n_heads = prior_n_heads
+        self.prior_depth = prior_depth
+        self.prior_width = prior_width
+        self.prior_n_ctx = prior_n_ctx
+        self.prior_latent_dim = prior_latent_dim
+        self.prior_attn_dropout = prior_attn_dropout
+        self.prior_resid_dropout = prior_resid_dropout
+        self.prior_emb_dropout = prior_emb_dropout
+        self.prior_zero_out = prior_zero_out
+        self.prior_res_scale = prior_res_scale
+        self.prior_pos_init = prior_pos_init
+        self.prior_blocks = prior_blocks
+        self.prior_m_attn = prior_m_attn
+        self.prior_spread = prior_spread
+        self.prior_alignment_layer = prior_alignment_layer
+        self.prior_alignment_head = prior_alignment_head
+        self.prior_init_scale = prior_init_scale
+
+        # Audio conditioning : upsampler parameters
         self.cond_depth = cond_depth
         self.cond_width = cond_width
         self.cond_dilation_growth_rate = cond_dilation_growth_rate
         self.cond_dilation_cycle = cond_dilation_cycle
         self.cond_c_res = cond_c_res
+        self.cond_zero_out = cond_zero_out
+        self.cond_m_conv = cond_m_conv
         self.cond_res_scale = cond_res_scale
-        self.prime_cond_c_res = prime_cond_c_res
-        self.prime_heads = prime_heads
-        self.prime_attn_order = prime_attn_order
-        self.prime_blocks = prime_blocks
-        self.prime_init_scale = prime_init_scale
-        self.prime_c_res = prime_c_res
-        self.prime_loss_fraction = prime_loss_fraction
+        self.cond_downs_t = cond_downs_t
+        self.cond_strides_t = cond_strides_t
+
+        # Metadata conditioning
+        self.max_nb_genres = max_nb_genres
+        self.sampling_rate = sampling_rate
+        self.metadata_dims = metadata_dims
+        self.timing_dims = timing_dims
         self.min_duration = min_duration
         self.max_duration = max_duration
-        self.fp16_params = fp16_params
-        self.alignment_layer = alignment_layer
-        self.alignment_head = alignment_head
-        self.m_attn = m_attn
+        self.metadata_conditioning = metadata_conditioning
 
-        self.init_std = init_std
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
+        # Lyric conditioning
+        self.merged_decoder = merged_decoder  # is this equivalent ?
+        self.single_enc_dec = single_enc_dec
+        self.lyric_conditioning = lyric_conditioning
+        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
 
-        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.prime_attn_dropout = prime_attn_dropout
+        self.prime_attn_order = prime_attn_order
+        self.prime_blocks = prime_blocks
+        self.prime_depth = prime_depth
+        self.prime_emb_dropout = prime_emb_dropout
+        self.prime_heads = prime_heads
+        self.prime_init_scale = prime_init_scale
+        self.prime_loss_fraction = prime_loss_fraction
+        self.prime_m_attn = prime_m_attn
+        self.prime_m_mlp = prime_m_mlp
+        self.prime_pos_init = prime_pos_init
+        self.prime_resid_dropout = prime_resid_dropout
+        self.prime_res_scale = prime_res_scale
+        self.prime_spread = prime_spread
+        self.prime_width = prime_width
+        self.prime_zero_out = prime_zero_out
+        self.prime_n_vocab = prime_n_vocab
+
+        # VQVAE parameters (all used)
+        self.vqvae_levels = vqvae_levels
+        self.vqvae_downs_t = vqvae_downs_t
+        self.vqvae_strides_t = vqvae_strides_t
+        self.vqvae_emmbedding_width = vqvae_emmbedding_width
+        self.vqvae_codebook_dimension = vqvae_codebook_dimension
+        self.vqvae_width = vqvae_width
+        self.vqvae_depth = vqvae_depth
+        self.vqvae_m_conv = vqvae_m_conv
+        self.vqvae_dilation_growth_rate = vqvae_dilation_growth_rate
+        self.vqvae_dilation_cycle = vqvae_dilation_cycle
+        self.vqvae_multipliers = vqvae_multipliers
+        self.vqvae_lmu = vqvae_lmu
+        self.vqvae_commit = vqvae_commit
+        self.vqvae_conv_block_depth = vqvae_conv_block_depth
+        self.vqvae_conv_block_width = vqvae_conv_block_width
+        self.vqvae_reverse_decoder_dilation = vqvae_reverse_decoder_dilation
+
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 8e39a07e7db07..57150132ca744 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -12,10 +12,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Convert ViT checkpoints trained with the DINO method."""
-
+"""Convert Jukebox checkpoints"""
 
 import argparse
+import json
 import os
 from pathlib import Path
 
@@ -43,11 +43,6 @@ def rename_key(dct, old, new):
         "5b/prior_level_1.pth.tar",
         "1b_lyrics/prior_level_2.pth.tar",
     ],
-    "jukebox-5b": [
-        "5b/vqvae.pth.tar5b/prior_level_0.pth.tar",
-        "5b/prior_level_1.pth.tar",
-        "5b/prior_level_2.pth.tar",
-    ],
     "jukebox-5b-lyrics": [
         "5b/vqvae.pth.tar5b/prior_level_0.pth.tar",
         "5b/prior_level_1.pth.tar",
@@ -57,129 +52,147 @@ def rename_key(dct, old, new):
 
 
 def replace_key(key):
-    if ".k." in key:  # replace vqvae.X.k with vqvae.X.codebook
-        return key.replace(".k.", ".codebook.")
-    elif ".y_emb." in key:
-        key = key.replace(".y_emb.", ".metadata_embedding.")
-
-
-# TODO right a clean conversion code using regex or replace
-# depending on the most appropriate choice
-def fix_jukebox_keys(state_dict, model_state_dict):
+    if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
+        key = key.replace(".model.1.bias", ".conv1d_1.bias")
+    elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
+        key = key.replace(".model.1.weight", ".conv1d_1.weight")
+    elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
+        key = key.replace(".model.3.bias", ".conv1d_2.bias")
+    elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
+        key = key.replace(".model.3.weight", ".conv1d_2.weight")
+
+    if key.endswith("k"):  # replace vqvae.X.k with vqvae.X.codebook
+        return key.replace(".k", ".codebook")
+    if "y_emb." in key:
+        return key.replace("y_emb.", "metadata_embedding.")
+    if ".ln" in key:
+        return key.replace(".ln", ".layer_norm")
+    if "_ln" in key:
+        return key.replace("_ln", "_layer_norm")
+    if "prime_prior" in key:
+        return key.replace("prime_prior", "lyric_encoder")
+    if "prime_x_out" in key:
+        return key.replace("prime_x_out", "lyric_enc_proj_out")
+    if "prior.x_out" in key:  # and "conditioner_block" in key
+        return key.replace("x_out", "fc_proj_out")
+    if "x_emb" in key:  # and "conditioner_block" in key
+        return key.replace("x_emb", "embed_tokens")
+    return key
+
+def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
     new_dict = {}
-    model_unformatted_keys = {".".join(k.split(".")[2:]) for k in model_state_dict.keys()}
     import re
 
-    model_to_conv = {1: "conv1d_1", 3: "conv1d_2"}
-    re_cond_block = re.compile("conditioner_blocks.([\d]).cond.model.([\d]).([\d]).model.([\d])")
-    groups = re_cond_block.match(original_key).groups()
-    block_index = int(groups[0]) * 2 + int(groups[1])
-    re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}.resnet_block.{model_to_conv[groups[-1]]}"
+    re_encoder_block_conv_in = re.compile("encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
+    re_encoder_block_resnet = re.compile(
+        "encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
+    )
+    re_encoder_block_proj_out = re.compile("encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
 
-    re_cond_block.sub(re_new_key, original_key)
+    re_decoder_block_conv_out = re.compile("decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
+    re_decoder_block_resnet = re.compile(
+        "decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
+    )
+    re_decoder_block_proj_in = re.compile("decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
 
-    for original_key, value in state_dict.items():
-        key = original_key
+    re_prior_cond_conv_out = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)")
+    re_prior_cond_resnet = re.compile(
+        "conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
+    )
+    re_prior_cond_proj_in = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)")
 
-        if ".k." in key:
-            key = key.replace(".k.", ".codebook.")
+    for original_key, value in state_dict.items():
 
-        elif ".y_emb." in key:
-            key = key.replace(".y_emb.", ".metadata_embedding.")
+        # rename vqvae.encoder keys
+        if re_encoder_block_conv_in.fullmatch(original_key):
+            regex_match = re_encoder_block_conv_in.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[2]) * 2 + int(groups[3])
+            re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}"
+            key = re_encoder_block_conv_in.sub(re_new_key, original_key)
+
+        elif re_encoder_block_resnet.fullmatch(original_key):
+            regex_match = re_encoder_block_resnet.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[2]) * 2 + int(groups[3])
+            conv_index = {"1": 1, "3": 2}[groups[-2]]
+            prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}."
+            resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
+            re_new_key = prefix + resnet_block
+            key = re_encoder_block_resnet.sub(re_new_key, original_key)
+
+        elif re_encoder_block_proj_out.fullmatch(original_key):
+            regex_match = re_encoder_block_proj_out.match(original_key)
+            groups = regex_match.groups()
+            re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}"
+            key = re_encoder_block_proj_out.sub(re_new_key, original_key)
+
+        # rename vqvae.decoder keys
+        elif re_decoder_block_conv_out.fullmatch(original_key):
+            regex_match = re_decoder_block_conv_out.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[2]) * 2 + int(groups[3]) - 2
+            re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}"
+            key = re_decoder_block_conv_out.sub(re_new_key, original_key)
+
+        elif re_decoder_block_resnet.fullmatch(original_key):
+            regex_match = re_decoder_block_resnet.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[2]) * 2 + int(groups[3]) - 2
+            conv_index = {"1": 1, "3": 2}[groups[-2]]
+            prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}."
+            resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
+            re_new_key = prefix + resnet_block
+            key = re_decoder_block_resnet.sub(re_new_key, original_key)
+
+        elif re_decoder_block_proj_in.fullmatch(original_key):
+            regex_match = re_decoder_block_proj_in.match(original_key)
+            groups = regex_match.groups()
+            re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}"
+            key = re_decoder_block_proj_in.sub(re_new_key, original_key)
+
+        # rename prior cond.model to upsampler.upsample_block and resnet
+        elif re_prior_cond_conv_out.fullmatch(original_key):
+            regex_match = re_prior_cond_conv_out.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[1]) * 2 + int(groups[2]) - 2
+            re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}.{groups[-1]}"
+            key = re_prior_cond_conv_out.sub(re_new_key, original_key)
+
+        elif re_prior_cond_resnet.fullmatch(original_key):
+            regex_match = re_prior_cond_resnet.match(original_key)
+            groups = regex_match.groups()
+            block_index = int(groups[1]) * 2 + int(groups[2]) - 2
+            conv_index = {"1": 1, "3": 2}[groups[-2]]
+            prefix = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}."
+            resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
+            re_new_key = prefix + resnet_block
+            key = re_prior_cond_resnet.sub(re_new_key, original_key)
+
+        elif re_prior_cond_proj_in.fullmatch(original_key):
+            regex_match = re_prior_cond_proj_in.match(original_key)
+            groups = regex_match.groups()
+            re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.proj_in.{groups[-1]}"
+            key = re_prior_cond_proj_in.sub(re_new_key, original_key)
+
+        # keep original key
         else:
-            wo_model = key.split("model")
-            if len(wo_model) == 2 and "encoders" in key:
-                if len(wo_model[1].split(".")) <= 3:
-                    key = wo_model[0] + "proj_out." + wo_model[1].split(".")[-1]
-                else:
-                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
-                    key = wo_model[0] + "downsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-            elif len(wo_model) == 2 and "decoders" in key:
-                if len(wo_model[1].split(".")) <= 3:
-                    key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
-                else:
-                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                    key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-            elif len(wo_model) == 2 and "cond.model." in key:
-                if len(wo_model[1].split(".")) <= 3:
-                    key = wo_model[0] + "proj_in." + wo_model[1].split(".")[-1]
-                else:
-                    block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                    key = wo_model[0] + "upsample_block." + block_index + "." + wo_model[1].split(".")[-1]
-            elif len(wo_model) == 3 and "priors" in key:
-                # should also rename cond to low_lvl_conditioner
-                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                key = (
-                    wo_model[0]
-                    + "upsample_block."
-                    + block_index
-                    + ".resnet_block."
-                    + wo_model[1].split(".")[-2]
-                    + ".model"
-                    + wo_model[2]
-                )
-            elif len(wo_model) == 4 and "decoders" in key:
-                # convert from
-                # model.1.0 is the first upsample block's resnet layer. Then this
-                # layer has resnet_blocks (1 to 3) which has a sequential (last model). 3 is the 3nd conv
-                # vqvae.decoders.0.level_blocks.0.model.1.0.model.1.model.3.bias
-                # to
-                # vqvae.decoders.1.level_blocks.0.upsample_block.1.resnet_blocks.2.conv1d_2.weight
-                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]) - 2)
-                key = (
-                    wo_model[0]
-                    + "upsample_block."
-                    + block_index
-                    + ".resnet_block."
-                    + wo_model[2].split(".")[1]
-                    + ".model"
-                    + wo_model[3]
-                )
-            elif len(wo_model) == 4 and "encoders" in key:
-                block_index = str(int(wo_model[1].split(".")[1]) * 2 + int(wo_model[1].split(".")[2]))
-                key = (
-                    wo_model[0]
-                    + "downsample_block."
-                    + block_index
-                    + ".resnet_block."
-                    + wo_model[2].split(".")[1]
-                    + ".model"
-                    + wo_model[3]
-                )
-
-            if key.endswith(".model.1.bias") and len(key.split(".")) > 10:
-                key = key.replace(".model.1.bias", ".conv1d_1.bias")
-            elif key.endswith(".model.1.weight") and len(key.split(".")) > 10:
-                key = key.replace(".model.1.weight", ".conv1d_1.weight")
-            elif key.endswith(".model.3.bias") and len(key.split(".")) > 10:
-                key = key.replace(".model.3.bias", ".conv1d_2.bias")
-            elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
-                key = key.replace(".model.3.weight", ".conv1d_2.weight")
-
-        if ".cond." in key:
-            key = key.replace(".cond.", ".upsampler.")
-        if ".ln" in key:
-            key = key.replace(".ln", ".layer_norm")
-        if "_ln" in key:
-            key = key.replace("_ln", "_layer_norm")
-        if "prime_prior" in key:
-            key = key.replace("prime_prior", "lyric_encoder")
-        if "prime_x_out" in key:
-            key = key.replace("prime_x_out", "lyric_enc_proj_out")
-        # if "x_emb" in key:
-        #     key = key.replace("x_emb","lyric_enc_proj_out")
-        if not "conditioner_blocks" in key and "x_emb" in key:
-            key = key.replace("x_emb", "lyric_enc.proj_out")
-        if key not in model_unformatted_keys:
+            key = original_key
+
+        key = replace_key(key)
+
+        if f"{key_prefix}.{key}" not in model_state_dict or key is None:
             print(f"failed converting {original_key} to {key}, does not match")
 
-        # elif value.shape != model_state_dict[key].shape:
-        #     print(
-        #         f"{original_key}-> {key} : \nshape {model_unformatted_keys[key].shape} and"
-        #         f" { value.shape}, do not match"
-        #     )
-        #     key = original_key
+        # handle missmatched shape
+        elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape:
+            val = model_state_dict[f"{key_prefix}.{key}"]
+            print(f"{original_key}-> {key} : \nshape {val.shape} and { value.shape}, do not match")
+            key = original_key
+
+        mapping[key] = original_key
         new_dict[key] = value
+
     return new_dict
 
 
@@ -194,16 +207,14 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
             os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True)
             open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content)
 
-    vqvae, *priors = MODEL_MAPPING[model_name.split("/")[-1]]
-    vqvae_dic = torch.load(f"{pytorch_dump_folder_path}/{vqvae.split('/')[-1]}", map_location=torch.device("cpu"))[
-        "model"
-    ]
+    model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]]
 
     config = JukeboxConfig.from_pretrained("ArthurZ/" + model_name)
     model = JukeboxModel(config)
 
     weight_dict = []
-    for dict_name in priors:
+    mapping = {}
+    for i, dict_name in enumerate(model_to_convert):
         old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}")["model"]
 
         new_dic = {}
@@ -217,16 +228,21 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
             else:
                 new_dic[k] = old_dic[k]
 
-        new_dic = fix_jukebox_keys(new_dic, model.state_dict())
+        key_prefix = "vqvae" if i == 0 else f"priors.{i-1}"
+        new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping)
         weight_dict.append(new_dic)
 
-    model.vqvae.load_state_dict(vqvae_dic)
+    with open("mapping.json", "w") as txtfile:
+        json.dump(mapping, txtfile)
+
+    vqvae_state_dict = weight_dict.pop(0)
+    model.vqvae.load_state_dict(vqvae_state_dict)
     for i in range(len(weight_dict)):
         model.priors[i].load_state_dict(weight_dict[i])
 
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
-    model.save_pretrained(pytorch_dump_folder_path, save_config=False)
+    model.save_pretrained(pytorch_dump_folder_path)
 
     return weight_dict
 
@@ -236,7 +252,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     # Required parameters
     parser.add_argument(
         "--model_name",
-        default="jukebox-1b-lyrics",
+        default="jukebox-5b-lyrics",
         type=str,
         help="Name of the model you'd like to convert.",
     )
@@ -247,26 +263,21 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         help="Path to the output PyTorch model directory.",
     )
     args = parser.parse_args()
-    convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
-
-
-# previous code to convert dummy :
-# weight_dict = []
-# vqvae_dic = torch.load("/Users/arthur/Work/HuggingFace/jukebox/porting/vqvae.pth")
-# weight_dict.append(vqvae_dic)
-
-# for dict_name in ["up0", "up1", "up2"]:
-#     old_dic = torch.load(f"/Users/arthur/Work/HuggingFace/jukebox/porting/{dict_name}.pth")
-#     new_dic = {}
-#     for k in old_dic.keys():
-#         if k.endswith(".b"):
-#             new_dic[k.replace("b", "bias")] = old_dic[k]
-#         elif k.endswith(".w"):
-#             new_dic[k.replace("w", "weight")] = old_dic[k]
-#         elif dict_name != "up2" and "cond.model." in k:
-#             new_dic[k.replace(".blocks.", ".model.")] = old_dic[k]
-#         else:
-#             new_dic[k] = old_dic[k]
-#     weight_dict.append(new_dic)
-
-# return weight_dict
+
+    jb_5b_config = JukeboxConfig(
+        prior_attn_order=[10, 2, 2],
+        prior_blocks=128,
+        prime_n_vocab=80,
+        nb_relevant_lyric_tokens=[512, 0, 0],
+        prior_n_heads=[512, 0, 0],
+        prior_n_ctx=[8192, 8192, 8192],
+        prime_width=[1280, 128, 128],
+        prior_width=[4096, 2048, 1024],
+        single_enc_dec=[False, False, False],
+        timing_dims=128,
+        vqvae_width=64,
+        metadata_conditioning=[(120, 4111), (120, 4111), (120, 4111)],
+        min_duration=23.8,
+    )
+
+    # convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 73eb8c58424e1..30259dd310bcf 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -313,7 +313,6 @@ def level_block(level, down_t, stride_t):
         for level, down_t, stride_t in iterator:
             self.level_blocks.append(level_block(level, down_t, stride_t))
 
-        # TODO rename to proj out
         self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
 
     def forward(self, hidden_states, all_levels=True):
@@ -535,7 +534,7 @@ def forward(self, input_audio):
         for level in range(self.levels):
             level_block = self.level_blocks[-level - 1]
             hidden_states = input_audio[level]
-            sampled_tokens, quantised_states, commit_loss, metric = level_block(
+            sampled_tokens, quantised_state, commit_loss, metric = level_block(
                 hidden_states, update_codebook=self.training
             )
             music_tokens.append(sampled_tokens)
@@ -554,31 +553,31 @@ class JukeboxVQVAE(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         if not config.sample_length:
-            downsamples = calculate_strides(config.vq_vae_strides_t, config.vq_vae_downs_t)
+            downsamples = calculate_strides(config.vqvae_strides_t, config.vqvae_downs_t)
             top_raw_to_tokens = np.prod(downsamples)
             config.sample_length = (
-                (config.sample_length_in_seconds * config.sr // top_raw_to_tokens) * top_raw_to_tokens
+                (config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens) * top_raw_to_tokens
             ).astype(int)
 
         input_shape = (config.sample_length, 1)
         block_kwargs = dict(
-            width=config.vq_vae_conv_block_width,
-            depth=config.vq_vae_conv_block_depth,
-            m_conv=config.vq_vae_m_conv,
-            dilation_growth_rate=config.vq_vae_dilation_growth_rate,
-            dilation_cycle=config.vq_vae_dilation_cycle,
-            reverse_decoder_dilation=config.vq_vae_reverse_decoder_dilation,
+            width=config.vqvae_conv_block_width,
+            depth=config.vqvae_conv_block_depth,
+            m_conv=config.vqvae_m_conv,
+            dilation_growth_rate=config.vqvae_dilation_growth_rate,
+            dilation_cycle=config.vqvae_dilation_cycle,
+            reverse_decoder_dilation=config.vqvae_reverse_decoder_dilation,
         )
 
-        multipliers = config.vq_vae_multipliers
-        codebook_width = config.vq_vae_emmbedding_width
-        self.width = config.vq_vae_width
-        self.depth = config.vq_vae_depth
+        multipliers = config.vqvae_multipliers
+        codebook_width = config.vqvae_emmbedding_width
+        self.width = config.vqvae_width
+        self.depth = config.vqvae_depth
 
-        self.downs_t = downs_t = config.vq_vae_downs_t
-        self.strides_t = strides_t = config.vq_vae_strides_t
-        self.codebook_dim = codebook_dim = config.vq_vae_codebook_dimension
-        self.commit = config.vq_vae_commit
+        self.downs_t = downs_t = config.vqvae_downs_t
+        self.strides_t = strides_t = config.vqvae_strides_t
+        self.codebook_dim = codebook_dim = config.vqvae_codebook_dimension
+        self.commit = config.vqvae_commit
 
         self.sample_length = input_shape[0]
         x_shape, x_channels = input_shape[:-1], input_shape[-1]
@@ -586,7 +585,7 @@ def __init__(self, config):
 
         self.downsamples = calculate_strides(strides_t, downs_t)
         self.hop_lengths = np.cumprod(self.downsamples)
-        self.levels = levels = config.vq_vae_levels
+        self.levels = levels = config.vqvae_levels
         self.music_tokens_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
 
         if multipliers is None:
@@ -626,7 +625,7 @@ def decoder(level):
             self.encoders.append(encoder(level))
             self.decoders.append(decoder(level))
 
-        self.bottleneck = JukeboxBottleneck(codebook_dim, codebook_width, config.vq_vae_lmu, levels)
+        self.bottleneck = JukeboxBottleneck(codebook_dim, codebook_width, config.vqvae_lmu, levels)
 
     def preprocess(self, raw_audio):
         # x: NTC [-1,1] -> NCT [-1,1]
@@ -1488,6 +1487,7 @@ def __init__(
         only_encode=False,
         merged_decoder=False,
         prime_len=None,
+        afn="quick_gelu",
     ):
         """
         - input_shape : respective dimension of the different inputs (lyrics/music_tokens)
@@ -1529,7 +1529,7 @@ def __init__(
             n_depth=depth,
             attn_dropout=attn_dropout,
             resid_dropout=resid_dropout,
-            afn="quick_gelu",
+            afn=afn,
             scale=True,
             mask=mask,
             zero_out=zero_out,
@@ -1598,7 +1598,7 @@ def forward(
         with torch.no_grad():
             tokens = self.preprocess(tokens)
 
-        N = hidden_states.shape[0]
+        N = tokens.shape[0]
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros((N, 1, self.width), device=tokens.device, dtype=torch.float)
 
@@ -1900,13 +1900,10 @@ class MusicTokenConditioner(nn.Module):
 
     """
 
-    # TODO check why embed_dim is initialized to config.latent_dim which is 2048 = to codebook_di. is it
-    # latent_dim?
     def __init__(
         self, input_shape, embed_dim, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
     ):
         super().__init__()
-        # self.x_shape = input_shape # is this needed?
         self.width = out_width
         self.embed_tokens = nn.Embedding(embed_dim, out_width)
         nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
@@ -2023,13 +2020,13 @@ def __init__(
         self,
         metadata_dims,
         timing_dims,
-        sr,
+        sampling_rate,
         min_duration,
         max_duration,
         n_time,
         out_width,
         init_scale,
-        max_bow_genre_size,
+        max_nb_genres,
         include_time_signal,
     ):
         super().__init__()
@@ -2037,14 +2034,14 @@ def __init__(
         self.out_width = out_width
         # TODO rename bins
         bow_genre_bins, artist_bins = metadata_dims
-        self.max_bow_genre_size = max_bow_genre_size
+        self.max_nb_genres = max_nb_genres
         self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
         self.artist_emb = SimpleEmbedding(artist_bins, out_width, init_scale)
         self.include_time_signal = include_time_signal
         if self.include_time_signal:
             t_ranges = (
-                (min_duration * sr, max_duration * sr),  # Total length
-                (0.0, max_duration * sr),  # Absolute pos
+                (min_duration * sampling_rate, max_duration * sampling_rate),  # Total length
+                (0.0, max_duration * sampling_rate),  # Absolute pos
                 (0.0, 1.0),
             )  # Relative pos
             assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
@@ -2115,18 +2112,18 @@ def __init__(self, config, level, encoder=None, decoder=None):
         vqvae_music_tokens_shapes = config.vqvae_music_tokens_shapes
 
         def rescale(music_tokens_shape):
-            return (music_tokens_shape[0] * config.n_ctx[-level - 1] // vqvae_music_tokens_shapes[level][0],)
+            return (music_tokens_shape[0] * config.prior_n_ctx[-level - 1] // vqvae_music_tokens_shapes[level][0],)
 
         music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
-        self.use_tokens = config.use_tokens[-level - 1]
-        self.n_tokens = config.n_tokens[-level - 1]
+        self.lyric_conditioning = config.lyric_conditioning[-level - 1]
+        self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens[-level - 1]
 
         # TODO rename prime loss fraction
         self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
 
         self.copy_input = config.copy_input
         if self.copy_input:
-            config.bins = config.latent_dim
+            config.bins = config.prior_latent_dim
 
         self.music_tokens_shapes = music_tokens_shapes
         self.levels = len(self.music_tokens_shapes)
@@ -2135,32 +2132,32 @@ def rescale(music_tokens_shape):
 
         self.level = level
 
-        self.latent_dim = config.latent_dim
+        self.latent_dim = config.prior_latent_dim
 
         prior_kwargs = dict(
-            input_shape=(config.n_ctx[-level - 1],),
-            embed_dim=config.latent_dim,
-            width=config.width[-level - 1],
-            depth=config.depth[-level - 1],
-            heads=config.n_heads[-level - 1],  # TODO Rename in config
-            attn_order=config.attn_order[-level - 1],
-            blocks=config.blocks,
-            spread=config.spread,
-            attn_dropout=config.attn_dropout,
-            resid_dropout=config.resid_dropout,
-            emb_dropout=config.emb_dropout,
-            zero_out=config.zero_out,
-            res_scale=config.res_scale,
-            pos_init=config.pos_init,
+            input_shape=(config.prior_n_ctx[-level - 1],),
+            embed_dim=config.prior_latent_dim,
+            width=config.prior_width[-level - 1],
+            depth=config.prior_depth[-level - 1],
+            heads=config.prior_n_heads[-level - 1],
+            attn_order=config.prior_attn_order[-level - 1],
+            blocks=config.prior_blocks,
+            spread=config.prior_spread,
+            attn_dropout=config.prior_attn_dropout,
+            resid_dropout=config.prior_resid_dropout,
+            emb_dropout=config.prior_emb_dropout,
+            zero_out=config.prior_zero_out,
+            res_scale=config.prior_res_scale,
+            pos_init=config.prior_pos_init,
             init_scale=config.init_scale[-level - 1],
-            m_attn=config.m_attn,  # m_mlp=config.m_mlp
+            m_attn=config.prior_m_attn,
         )
 
-        if config.use_tokens and not config.single_enc_dec[-level - 1]:
+        if config.lyric_conditioning and not config.single_enc_dec[-level - 1]:
             # TODO rename to encoder_kwargs as they are used both
             # when single and not
             lyric_enc_kwargs = dict(
-                n_vocab=config.n_vocab,
+                n_vocab=config.prime_n_vocab,
                 width=config.prime_width[-level - 1],
                 depth=config.prime_depth[-level - 1],
                 heads=config.prime_heads,
@@ -2178,11 +2175,11 @@ def rescale(music_tokens_shape):
                 m_mlp=config.prime_m_mlp,
             )
         else:
-            lyric_enc_kwargs = dict(n_vocab=config.n_vocab)
+            lyric_enc_kwargs = dict(n_vocab=config.prime_n_vocab)
 
         audio_conditioning_kwargs = dict(
-            out_width=config.width[-level - 1],
-            init_scale=config.init_scale[-level - 1],
+            out_width=config.prior_width[-level - 1],
+            init_scale=config.prior_init_scale[-level - 1],
             width=config.cond_width[-level - 1],
             depth=config.cond_depth[-level - 1],
             m_conv=config.cond_m_conv,
@@ -2194,14 +2191,14 @@ def rescale(music_tokens_shape):
         )  # have to keep this else names wrong
 
         metadata_conditioning_kwargs = dict(
-            out_width=config.width[-level - 1],
-            init_scale=config.init_scale[-level - 1],
-            metadata_dims=config.y_bins[-level - 1],  # rename to metadata_dims
-            timing_dims=config.t_bins,  # rename to timing_dims or timing_intervals
-            sr=config.sr,
+            out_width=config.prior_width[-level - 1],
+            init_scale=config.prior_init_scale[-level - 1],
+            metadata_dims=config.metadata_dims[-level - 1],  # rename to metadata_dims
+            timing_dims=config.timing_dims,  # rename to timing_dims or timing_intervals
+            sampling_rate=config.sampling_rate,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
-            max_bow_genre_size=config.max_bow_genre_size,
+            max_nb_genres=config.max_nb_genres,
         )
 
         # Audio conditioning
@@ -2209,7 +2206,7 @@ def rescale(music_tokens_shape):
         self.cond_level = level + 1
 
         # metadata conditioning
-        self.metadata_conditioning = config.labels  # TODO change config
+        self.metadata_conditioning = config.metadata_conditioning
 
         self.single_enc_dec = config.single_enc_dec[-level - 1]
         # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
@@ -2219,9 +2216,9 @@ def rescale(music_tokens_shape):
             def conditioner_block(_level):
                 return MusicTokenConditioner(
                     input_shape=music_tokens_shapes[_level],
-                    embed_dim=config.latent_dim,  # TODO should we remove in favor of the vqvae dim? Maybe not
-                    down_t=config.downs_t[_level],
-                    stride_t=config.strides_t[_level],
+                    embed_dim=config.prior_latent_dim,
+                    down_t=config.cond_downs_t[_level],
+                    stride_t=config.cond_strides_t[_level],
                     **audio_conditioning_kwargs,
                 )
 
@@ -2238,19 +2235,12 @@ def conditioner_block(_level):
         # TODO as the prior type can change, can't rename to decoder or enc_dec
         if config.single_enc_dec[-level - 1]:
             # Single encoder-decoder transformer
-            self.prior_shapes = [(self.n_tokens,), prior_kwargs.pop("input_shape")]
+            self.prior_shapes = [(self.nb_relevant_lyric_tokens,), prior_kwargs.pop("input_shape")]
             self.prior_embed_dim = [lyric_enc_kwargs["n_vocab"], prior_kwargs.pop("embed_dim")]
             self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
             self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
             self.prior_width = prior_kwargs["width"]
 
-            # print(f"Creating cond. autoregress with prior embed_dim {self.prior_embed_dim}, ")
-            # print(f"dims {self.prior_dims}, ")
-            # print(f"shift {self.prior_embed_dim_shift}")
-            # print(f"input shape {sum(self.prior_dims)}")
-            # print(f"input embed_dim (vocab size of the embedding layer) {sum(self.prior_embed_dim)}")
-            # print(f"Self copy is {self.copy_input}")
-
             # lyrics_enc_loss_dims was the prime loss dims, gen is for the generated tokens.
             # what is the shape of the lyrics loss?
 
@@ -2267,8 +2257,8 @@ def conditioner_block(_level):
 
         else:
             # Separate encoder-decoder transformer
-            if self.n_tokens != 0 and self.use_tokens:
-                lyric_enc_input_shape = (self.n_tokens,)
+            if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
+                lyric_enc_input_shape = (self.nb_relevant_lyric_tokens,)
                 self.lyrics_enc_loss_dims = np.prod(lyric_enc_input_shape)
                 self.lyric_acts_width, self.lyric_enc_width = lyric_enc_kwargs["width"], prior_kwargs["width"]
                 self.lyric_encoder = JukeboxConditionalAutoregressive(
@@ -2298,7 +2288,7 @@ def conditioner_block(_level):
             )
 
         self.n_ctx = self.gen_loss_dims
-        self.downsamples = calculate_strides(config.strides_t, config.downs_t)
+        self.downsamples = calculate_strides(config.cond_strides_t, config.cond_downs_t)
         self.cond_downsample = self.downsamples[level + 1] if level != self.levels - 1 else None
         self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
         self.sample_length = self.n_ctx * self.raw_to_tokens
@@ -2329,18 +2319,22 @@ def set_metadata_lyric_tokens(self, labels):
         """
         Processes the full labels to only retreive the relevant lyric tokens and keep the metadata conditioning tokens.
         """
-        if self.n_tokens > 0:
-            tokens_list = torch.zeros((labels.shape[0], self.n_tokens), dtype=torch.long, device=labels.device)
+        if self.nb_relevant_lyric_tokens > 0:
+            tokens_list = torch.zeros(
+                (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device
+            )
             indices_list = []  # whats the index of each current character in original array
             for idx in range(labels.shape[0]):
-                full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_bow_genre_size :]
+                full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :]
                 total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2]
-                tokens, indices = get_relevant_lyric_tokens(full_tokens, self.n_tokens, total_length, offset, duration)
+                tokens, indices = get_relevant_lyric_tokens(
+                    full_tokens, self.nb_relevant_lyric_tokens, total_length, offset, duration
+                )
                 tokens_list[idx, :] = tokens
                 indices_list.append(indices)
 
             return (
-                torch.cat((labels[:, : 4 + self.metadata_embedding.max_bow_genre_size], tokens_list), dim=-1),
+                torch.cat((labels[:, : 4 + self.metadata_embedding.max_nb_genres], tokens_list), dim=-1),
                 indices_list,
             )
         else:
@@ -2443,7 +2437,7 @@ def get_cond(self, music_tokens_conds, metadata):
         Converts the tokens to the input_embeddings. Splits the lyrics and the metadata. Lyric tokens can be None
         """
         if metadata is not None:
-            n_labels = metadata.shape[1] - self.n_tokens
+            n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens
             metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:]
         else:
             metadata, lyric_tokens = None, None
@@ -2486,7 +2480,7 @@ def sample(
                         [lyric_tokens, music_tokens], [None, audio_conditioning]
                     )
                 if sample_tokens is not None:
-                    sample_tokens += self.n_tokens
+                    sample_tokens += self.nb_relevant_lyric_tokens
                 tokens = self.prior.primed_sample(
                     n_samples,
                     music_tokens,
@@ -2535,7 +2529,7 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
         Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
         the lyric encoder.
         """
-        if self.n_tokens != 0 and self.use_tokens:
+        if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
             if sample:
                 self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
             lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
@@ -2553,7 +2547,7 @@ def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
         """
         Computes the loss for the lyric encoder, next token prediction.
         """
-        if self.use_tokens:
+        if self.lyric_conditioning:
             lyric_encoder_states = lyric_encoder_states.float()
             lyric_encoder_states = self.lyric_encoder.proj_out(lyric_encoder_states)
             lyric_enc_loss = nn.functional.cross_entropy(
@@ -2579,7 +2573,7 @@ def forward_tokens(
             self.prior.transformer.set_record_attn(get_attn_weights)
         audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
         if self.copy_input:
-            lyric_tokens = music_tokens[:, : self.n_tokens]
+            lyric_tokens = music_tokens[:, : self.nb_relevant_lyric_tokens]
 
         if self.single_enc_dec:  # the preprocess returns the full tokens, shifted
             tokens, audio_conditioning = self.prior_preprocess(
@@ -2696,7 +2690,7 @@ def get_starts(total_length, n_ctx, hop_length):
 
 
 # FIXME, consumes too much RAM so should probably be removed
-def get_alignment(music_tokens, labels, prior, level, fp16, hps):
+def get_alignment(music_tokens, labels, prior, level, fp16, config):
     """
     Should compute the lyric to music token alignment, but for now it cannot be used.
     """
@@ -2713,8 +2707,8 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     else:
         padding_length = 0
 
-    hop_length = int(hps.hop_fraction[-level - 1] * prior.n_ctx)
-    alignment_head, alignment_layer = hps.alignment_head[0], hps.alignment_layer[0]
+    hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx)
+    alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0]
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
@@ -2724,7 +2718,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
         end = start + n_ctx
 
         # set metadata offset, sample_length and lyrics tokens
-        metadata, indices_hop = prior.get_metadata(labels, start, hps.sample_length, get_indices=True, offset=0)
+        metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
 
         tokens_bs = torch.chunk(tokens, batch_size, dim=0)
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
@@ -2740,8 +2734,8 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
         alignment_hop = w.float().cpu().numpy()
         del w
 
-        # alignment_hop has shape (bs, n_ctx, n_tokens)
-        # indices_hop is a list of len=bs, each entry of len hps.n_tokens
+        # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
+        # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
         indices_hops[start] = indices_hop
         alignment_hops[start] = alignment_hop
     prior.cpu()
@@ -2765,7 +2759,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, hps):
     return alignments
 
 
-def save_wav(fname, lvl, metas, aud, sr):
+def save_wav(fname, lvl, metas, aud, sampling_rate):
     import soundfile
 
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
@@ -2774,16 +2768,18 @@ def save_wav(fname, lvl, metas, aud, sr):
             # twitter prompts or inputs are in the form of a dictionnary
             artists, genres, lyrics = list(metas)[i].values()
             path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
-            soundfile.write(path, aud[i], samplerate=sr, format="wav")
+            soundfile.write(path, aud[i], samplerate=sampling_rate, format="wav")
         else:
-            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sr, format="wav")
+            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sampling_rate, format="wav")
 
 
-def load_audio(file, sr, offset, duration, mono=False):
+def load_audio(file, sampling_rate, offset, duration, mono=False):
     import librosa
 
     # Librosa loads more filetypes than soundfile
-    raw_audio, _ = librosa.load(file, sr=sr, mono=mono, offset=offset / sr, duration=duration / sr)
+    raw_audio, _ = librosa.load(
+        file, sr=sampling_rate, mono=mono, offset=offset / sampling_rate, duration=duration / sampling_rate
+    )
     if len(raw_audio.shape) == 1:
         raw_audio = raw_audio.reshape((1, -1))
     return raw_audio
@@ -2792,7 +2788,7 @@ def load_audio(file, sr, offset, duration, mono=False):
 def load_prompts(audio_files, duration, hps):
     raw_audio_list = []
     for audio_file in audio_files:
-        raw_audio = load_audio(audio_file, sr=hps.sr, duration=duration, offset=0.0, mono=True)
+        raw_audio = load_audio(audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=0.0, mono=True)
         raw_audio = raw_audio.T  # CT -> TC
         raw_audio_list.append(raw_audio)
     while len(raw_audio_list) < hps.n_samples:
@@ -2812,7 +2808,6 @@ class JukeboxModel(JukeboxPreTrainedModel):
 
     def __init__(self, config):
         super().__init__(config)
-        self.embed_dim = config.hidden_size
         self.vqvae = JukeboxVQVAE(config)
         config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
@@ -2933,7 +2928,8 @@ def _sample(
         total_length = (
             sample_length
             if sample_length is not None
-            else (int(sample_length_in_seconds * self.config.sr) // top_prior.raw_to_tokens) * top_prior.raw_to_tokens
+            else (int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens)
+            * top_prior.raw_to_tokens
         )
         sampling_kwargs = [
             dict(
@@ -2988,8 +2984,8 @@ def _sample(
                 logdir = f"{self.start_time}/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                save_wav(logdir, level, metas=metas, aud=raw_audio, sr=self.config.sr)
-                if alignments is None and self.priors[-1] is not None and self.priors[-1].n_tokens > 0:
+                save_wav(logdir, level, metas=metas, aud=raw_audio, sampling_rate=self.config.sampling_rate)
+                if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     empty_cache()
                     alignments = get_alignment(
                         music_tokens,

From 44b45c4cd510790a5469555ef07ee8df9f076b1a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 14:34:29 +0000
Subject: [PATCH 086/196] cleanup

---
 src/transformers/models/jukebox/convert_jukebox.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 57150132ca744..8d4803c7916ab 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -79,6 +79,7 @@ def replace_key(key):
         return key.replace("x_emb", "embed_tokens")
     return key
 
+
 def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
     new_dict = {}
     import re

From 83399fe77a331eb73d4a982092cc205ed830c959 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 16:03:05 +0000
Subject: [PATCH 087/196] update conversion and debugged

---
 .../models/jukebox/configuration_jukebox.py   |  2 +-
 .../models/jukebox/convert_jukebox.py         | 64 +++++++++++--------
 .../models/jukebox/modeling_jukebox.py        | 22 +++----
 3 files changed, 49 insertions(+), 39 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 849996510ec1b..705c9dd673e5b 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -75,7 +75,7 @@ def __init__(
         metadata_dims=[(604, 7898), (120, 4111), (120, 4111)],
         copy_input=False,
         nb_priors=3,
-        timing_dims=128,
+        timing_dims=64,
         single_enc_dec=[True, False, False],
         metadata_conditioning=True,
         merged_decoder=[True, False, False],
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 8d4803c7916ab..b6463d794ab9e 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -44,7 +44,8 @@ def rename_key(dct, old, new):
         "1b_lyrics/prior_level_2.pth.tar",
     ],
     "jukebox-5b-lyrics": [
-        "5b/vqvae.pth.tar5b/prior_level_0.pth.tar",
+        "5b/vqvae.pth.tar",
+        "5b/prior_level_0.pth.tar",
         "5b/prior_level_1.pth.tar",
         "5b_lyrics/prior_level_2.pth.tar",
     ],
@@ -61,21 +62,27 @@ def replace_key(key):
     elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
         key = key.replace(".model.3.weight", ".conv1d_2.weight")
 
+    if "prime_prior" in key:
+        key = key.replace("prime_prior", "lyric_encoder")
+
     if key.endswith("k"):  # replace vqvae.X.k with vqvae.X.codebook
         return key.replace(".k", ".codebook")
     if "y_emb." in key:
         return key.replace("y_emb.", "metadata_embedding.")
+    if "prime_state_ln" in key:
+        return key.replace("prime_state_ln", "lyric_encoder.final_layer_norm")
     if ".ln" in key:
         return key.replace(".ln", ".layer_norm")
     if "_ln" in key:
         return key.replace("_ln", "_layer_norm")
-    if "prime_prior" in key:
-        return key.replace("prime_prior", "lyric_encoder")
+    
+    if "prime_state_proj" in key:
+        return key.replace("prime_state_proj", "lyric_encoder.proj_in")
     if "prime_x_out" in key:
-        return key.replace("prime_x_out", "lyric_enc_proj_out")
-    if "prior.x_out" in key:  # and "conditioner_block" in key
+        return key.replace("prime_x_out", "lyric_encoder.lm_head")
+    if "prior.x_out" in key: 
         return key.replace("x_out", "fc_proj_out")
-    if "x_emb" in key:  # and "conditioner_block" in key
+    if "x_emb" in key:  
         return key.replace("x_emb", "embed_tokens")
     return key
 
@@ -210,7 +217,27 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
 
     model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]]
 
-    config = JukeboxConfig.from_pretrained("ArthurZ/" + model_name)
+    # config = JukeboxConfig.from_pretrained("ArthurZ/" + model_name)
+    # config = JukeboxConfig(
+    #     timing_dims=128
+    #     prior_attn_order=[10, 2, 2],
+    #     prior_blocks=128,
+    #     prime_n_vocab=80,
+    #     nb_relevant_lyric_tokens=[512, 0, 0],
+    #     prior_n_heads=[8, 1, 1],
+    #     prior_n_ctx=[8192, 8192, 8192],
+    #     prime_width=[1280, 128, 128],
+    #     prior_width=[4800, 1920, 1920],
+    #     single_enc_dec=[False, False, False],
+    #     timing_dims=128,
+    #     vqvae_width=64,
+    #     metadata_dims=[(120, 4111), (120, 4111), (120, 4111)],
+    #     min_duration=23.8,
+    #     sample_length= 1058304,
+    #     prior_depth=[79, 72, 72],
+    #     max_nb_genres=1,
+    # )
+    config = JukeboxConfig(sample_length= 1058304)
     model = JukeboxModel(config)
 
     weight_dict = []
@@ -253,32 +280,15 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     # Required parameters
     parser.add_argument(
         "--model_name",
-        default="jukebox-5b-lyrics",
+        default="jukebox-1b-lyrics",
         type=str,
         help="Name of the model you'd like to convert.",
     )
     parser.add_argument(
         "--pytorch_dump_folder_path",
-        default="converted_model",
+        default="jukebox-1b-lyrics-converted",
         type=str,
         help="Path to the output PyTorch model directory.",
     )
     args = parser.parse_args()
-
-    jb_5b_config = JukeboxConfig(
-        prior_attn_order=[10, 2, 2],
-        prior_blocks=128,
-        prime_n_vocab=80,
-        nb_relevant_lyric_tokens=[512, 0, 0],
-        prior_n_heads=[512, 0, 0],
-        prior_n_ctx=[8192, 8192, 8192],
-        prime_width=[1280, 128, 128],
-        prior_width=[4096, 2048, 1024],
-        single_enc_dec=[False, False, False],
-        timing_dims=128,
-        vqvae_width=64,
-        metadata_conditioning=[(120, 4111), (120, 4111), (120, 4111)],
-        min_duration=23.8,
-    )
-
-    # convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
+    convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 30259dd310bcf..6d31cabda7489 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2149,7 +2149,7 @@ def rescale(music_tokens_shape):
             zero_out=config.prior_zero_out,
             res_scale=config.prior_res_scale,
             pos_init=config.prior_pos_init,
-            init_scale=config.init_scale[-level - 1],
+            init_scale=config.prior_init_scale[-level - 1],
             m_attn=config.prior_m_attn,
         )
 
@@ -2157,7 +2157,7 @@ def rescale(music_tokens_shape):
             # TODO rename to encoder_kwargs as they are used both
             # when single and not
             lyric_enc_kwargs = dict(
-                n_vocab=config.prime_n_vocab,
+                embed_dim=config.prime_n_vocab, # previously bins
                 width=config.prime_width[-level - 1],
                 depth=config.prime_depth[-level - 1],
                 heads=config.prime_heads,
@@ -2175,7 +2175,7 @@ def rescale(music_tokens_shape):
                 m_mlp=config.prime_m_mlp,
             )
         else:
-            lyric_enc_kwargs = dict(n_vocab=config.prime_n_vocab)
+            lyric_enc_kwargs = dict(embed_dim=config.prime_n_vocab)
 
         audio_conditioning_kwargs = dict(
             out_width=config.prior_width[-level - 1],
@@ -2236,7 +2236,7 @@ def conditioner_block(_level):
         if config.single_enc_dec[-level - 1]:
             # Single encoder-decoder transformer
             self.prior_shapes = [(self.nb_relevant_lyric_tokens,), prior_kwargs.pop("input_shape")]
-            self.prior_embed_dim = [lyric_enc_kwargs["n_vocab"], prior_kwargs.pop("embed_dim")]
+            self.prior_embed_dim = [lyric_enc_kwargs["embed_dim"], prior_kwargs.pop("embed_dim")]
             self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
             self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
             self.prior_width = prior_kwargs["width"]
@@ -2268,11 +2268,11 @@ def conditioner_block(_level):
                     only_encode=True,
                     **lyric_enc_kwargs,
                 )
-                self.lyric_encoder_proj_out = JukeboxConv1D(self.lyric_acts_width, self.lyric_enc_width)
-                self.lyric_encoder_layer_norm = JukeboxLayerNorm(self.lyric_enc_width)
-                self.lyric_enc_dim = lyric_enc_kwargs["n_vocab"]
-                self.lyric_encoder.proj_out = nn.Linear(self.lyric_enc_width, self.lyric_enc_dim, bias=False)
-                nn.init.normal_(self.lyric_encoder.proj_out.weight, std=0.02 * prior_kwargs["init_scale"])
+                self.lyric_encoder.proj_in = JukeboxConv1D(self.lyric_acts_width, self.lyric_enc_width)
+                self.lyric_encoder.final_layer_norm = JukeboxLayerNorm(self.lyric_enc_width)
+                self.lyric_enc_dim = lyric_enc_kwargs["embed_dim"]
+                self.lyric_encoder.lm_head = nn.Linear(self.lyric_enc_width, self.lyric_enc_dim, bias=False)
+                nn.init.normal_(self.lyric_encoder.lm_head.weight, std=0.02 * prior_kwargs["init_scale"])
             else:
                 self.lyrics_enc_loss_dims = 0
             self.gen_loss_dims = np.prod(self.music_tokens_shape)
@@ -2533,8 +2533,8 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
             if sample:
                 self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
             lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
-            lyric_acts = self.lyric_encoder_proj_out(lyric_acts)
-            lyric_encoder_states = self.lyric_encoder_layer_norm(lyric_acts)
+            lyric_acts = self.lyric_encoder.proj_in(lyric_acts)
+            lyric_encoder_states = self.lyric_encoder.final_layer_norm(lyric_acts)
             if sample:
                 self.lyric_encoder.cpu()
                 if fp16:

From e0048081172b1678510336251916dafb320b5a67 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 16:44:58 +0000
Subject: [PATCH 088/196] fix remaining bug, tests pass

---
 docs/source/en/_toctree.yml                   | 575 +++++++++---------
 .../models/jukebox/modeling_jukebox.py        |   2 +-
 2 files changed, 299 insertions(+), 278 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index dd1b80442e490..35f8d6f5c6d63 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -166,282 +166,303 @@
   - sections:
     - local: model_doc/auto
       title: Auto Classes
-    - local: model_doc/bart
-      title: BART
-    - local: model_doc/barthez
-      title: BARThez
-    - local: model_doc/bartpho
-      title: BARTpho
-    - local: model_doc/beit
-      title: BEiT
-    - local: model_doc/bert
-      title: BERT
-    - local: model_doc/bert-generation
-      title: BertGeneration
-    - local: model_doc/bert-japanese
-      title: BertJapanese
-    - local: model_doc/bertweet
-      title: Bertweet
-    - local: model_doc/big_bird
-      title: BigBird
-    - local: model_doc/bigbird_pegasus
-      title: BigBirdPegasus
-    - local: model_doc/blenderbot
-      title: Blenderbot
-    - local: model_doc/blenderbot-small
-      title: Blenderbot Small
-    - local: model_doc/bloom
-      title: BLOOM
-    - local: model_doc/bort
-      title: BORT
-    - local: model_doc/byt5
-      title: ByT5
-    - local: model_doc/camembert
-      title: CamemBERT
-    - local: model_doc/canine
-      title: CANINE
-    - local: model_doc/clip
-      title: CLIP
-    - local: model_doc/codegen
-      title: CodeGen
-    - local: model_doc/convbert
-      title: ConvBERT
-    - local: model_doc/convnext
-      title: ConvNeXT
-    - local: model_doc/cpm
-      title: CPM
-    - local: model_doc/ctrl
-      title: CTRL
-    - local: model_doc/cvt
-      title: CvT
-    - local: model_doc/data2vec
-      title: Data2Vec
-    - local: model_doc/deberta
-      title: DeBERTa
-    - local: model_doc/deberta-v2
-      title: DeBERTa-v2
-    - local: model_doc/decision_transformer
-      title: Decision Transformer
-    - local: model_doc/deit
-      title: DeiT
-    - local: model_doc/detr
-      title: DETR
-    - local: model_doc/dialogpt
-      title: DialoGPT
-    - local: model_doc/distilbert
-      title: DistilBERT
-    - local: model_doc/dit
-      title: DiT
-    - local: model_doc/dpr
-      title: DPR
-    - local: model_doc/dpt
-      title: DPT
-    - local: model_doc/electra
-      title: ELECTRA
-    - local: model_doc/encoder-decoder
-      title: Encoder Decoder Models
-    - local: model_doc/flaubert
-      title: FlauBERT
-    - local: model_doc/flava
-      title: FLAVA
-    - local: model_doc/fnet
-      title: FNet
-    - local: model_doc/fsmt
-      title: FSMT
-    - local: model_doc/funnel
-      title: Funnel Transformer
-    - local: model_doc/glpn
-      title: GLPN
-    - local: model_doc/openai-gpt
-      title: GPT
-    - local: model_doc/gpt_neo
-      title: GPT Neo
-    - local: model_doc/gpt_neox
-      title: GPT NeoX
-    - local: model_doc/gptj
-      title: GPT-J
-    - local: model_doc/gpt2
-      title: GPT2
-    - local: model_doc/groupvit
-      title: GroupViT
-    - local: model_doc/herbert
-      title: HerBERT
-    - local: model_doc/hubert
-      title: Hubert
-    - local: model_doc/ibert
-      title: I-BERT
-    - local: model_doc/imagegpt
-      title: ImageGPT
-    - local: model_doc/jukebox
-      title: Jukebox
-    - local: model_doc/layoutlm
-      title: LayoutLM
-    - local: model_doc/layoutlmv2
-      title: LayoutLMV2
-    - local: model_doc/layoutlmv3
-      title: LayoutLMV3
-    - local: model_doc/layoutxlm
-      title: LayoutXLM
-    - local: model_doc/led
-      title: LED
-    - local: model_doc/levit
-      title: LeViT
-    - local: model_doc/longformer
-      title: Longformer
-    - local: model_doc/longt5
-      title: LongT5
-    - local: model_doc/luke
-      title: LUKE
-    - local: model_doc/lxmert
-      title: LXMERT
-    - local: model_doc/m2m_100
-      title: M2M100
-    - local: model_doc/marian
-      title: MarianMT
-    - local: model_doc/maskformer
-      title: MaskFormer
-    - local: model_doc/mbart
-      title: MBart and MBart-50
-    - local: model_doc/mctct
-      title: MCTCT
-    - local: model_doc/megatron-bert
-      title: MegatronBERT
-    - local: model_doc/megatron_gpt2
-      title: MegatronGPT2
-    - local: model_doc/mluke
-      title: mLUKE
-    - local: model_doc/mobilebert
-      title: MobileBERT
-    - local: model_doc/mobilevit
-      title: MobileViT
-    - local: model_doc/mpnet
-      title: MPNet
-    - local: model_doc/mt5
-      title: MT5
-    - local: model_doc/mvp
-      title: MVP
-    - local: model_doc/nezha
-      title: NEZHA
-    - local: model_doc/nllb
-      title: NLLB
-    - local: model_doc/nystromformer
-      title: Nyströmformer
-    - local: model_doc/opt
-      title: OPT
-    - local: model_doc/owlvit
-      title: OWL-ViT
-    - local: model_doc/pegasus
-      title: Pegasus
-    - local: model_doc/perceiver
-      title: Perceiver
-    - local: model_doc/phobert
-      title: PhoBERT
-    - local: model_doc/plbart
-      title: PLBart
-    - local: model_doc/poolformer
-      title: PoolFormer
-    - local: model_doc/prophetnet
-      title: ProphetNet
-    - local: model_doc/qdqbert
-      title: QDQBert
-    - local: model_doc/rag
-      title: RAG
-    - local: model_doc/realm
-      title: REALM
-    - local: model_doc/reformer
-      title: Reformer
-    - local: model_doc/regnet
-      title: RegNet
-    - local: model_doc/rembert
-      title: RemBERT
-    - local: model_doc/resnet
-      title: ResNet
-    - local: model_doc/retribert
-      title: RetriBERT
-    - local: model_doc/roberta
-      title: RoBERTa
-    - local: model_doc/roformer
-      title: RoFormer
-    - local: model_doc/segformer
-      title: SegFormer
-    - local: model_doc/sew
-      title: SEW
-    - local: model_doc/sew-d
-      title: SEW-D
-    - local: model_doc/speech-encoder-decoder
-      title: Speech Encoder Decoder Models
-    - local: model_doc/speech_to_text
-      title: Speech2Text
-    - local: model_doc/speech_to_text_2
-      title: Speech2Text2
-    - local: model_doc/splinter
-      title: Splinter
-    - local: model_doc/squeezebert
-      title: SqueezeBERT
-    - local: model_doc/swin
-      title: Swin Transformer
-    - local: model_doc/t5
-      title: T5
-    - local: model_doc/t5v1.1
-      title: T5v1.1
-    - local: model_doc/tapas
-      title: TAPAS
-    - local: model_doc/tapex
-      title: TAPEX
-    - local: model_doc/trajectory_transformer
-      title: Trajectory Transformer
-    - local: model_doc/transfo-xl
-      title: Transformer XL
-    - local: model_doc/trocr
-      title: TrOCR
-    - local: model_doc/ul2
-      title: UL2
-    - local: model_doc/unispeech
-      title: UniSpeech
-    - local: model_doc/unispeech-sat
-      title: UniSpeech-SAT
-    - local: model_doc/van
-      title: VAN
-    - local: model_doc/vilt
-      title: ViLT
-    - local: model_doc/vision-encoder-decoder
-      title: Vision Encoder Decoder Models
-    - local: model_doc/vision-text-dual-encoder
-      title: Vision Text Dual Encoder
-    - local: model_doc/vit
-      title: Vision Transformer (ViT)
-    - local: model_doc/visual_bert
-      title: VisualBERT
-    - local: model_doc/vit_mae
-      title: ViTMAE
-    - local: model_doc/wav2vec2
-      title: Wav2Vec2
-    - local: model_doc/wav2vec2-conformer
-      title: Wav2Vec2-Conformer
-    - local: model_doc/wav2vec2_phoneme
-      title: Wav2Vec2Phoneme
-    - local: model_doc/wavlm
-      title: WavLM
-    - local: model_doc/xglm
-      title: XGLM
-    - local: model_doc/xlm
-      title: XLM
-    - local: model_doc/xlm-prophetnet
-      title: XLM-ProphetNet
-    - local: model_doc/xlm-roberta
-      title: XLM-RoBERTa
-    - local: model_doc/xlm-roberta-xl
-      title: XLM-RoBERTa-XL
-    - local: model_doc/xlnet
-      title: XLNet
-    - local: model_doc/xls_r
-      title: XLS-R
-    - local: model_doc/xlsr_wav2vec2
-      title: XLSR-Wav2Vec2
-    - local: model_doc/yolos
-      title: YOLOS
-    - local: model_doc/yoso
-      title: YOSO
+    - isExpanded: false
+      sections:
+      - local: model_doc/albert
+        title: ALBERT
+      - local: model_doc/bart
+        title: BART
+      - local: model_doc/barthez
+        title: BARThez
+      - local: model_doc/bartpho
+        title: BARTpho
+      - local: model_doc/bert
+        title: BERT
+      - local: model_doc/bert-generation
+        title: BertGeneration
+      - local: model_doc/bert-japanese
+        title: BertJapanese
+      - local: model_doc/bertweet
+        title: Bertweet
+      - local: model_doc/big_bird
+        title: BigBird
+      - local: model_doc/bigbird_pegasus
+        title: BigBirdPegasus
+      - local: model_doc/blenderbot
+        title: Blenderbot
+      - local: model_doc/blenderbot-small
+        title: Blenderbot Small
+      - local: model_doc/bloom
+        title: BLOOM
+      - local: model_doc/bort
+        title: BORT
+      - local: model_doc/byt5
+        title: ByT5
+      - local: model_doc/camembert
+        title: CamemBERT
+      - local: model_doc/canine
+        title: CANINE
+      - local: model_doc/codegen
+        title: CodeGen
+      - local: model_doc/convbert
+        title: ConvBERT
+      - local: model_doc/cpm
+        title: CPM
+      - local: model_doc/ctrl
+        title: CTRL
+      - local: model_doc/deberta
+        title: DeBERTa
+      - local: model_doc/deberta-v2
+        title: DeBERTa-v2
+      - local: model_doc/dialogpt
+        title: DialoGPT
+      - local: model_doc/distilbert
+        title: DistilBERT
+      - local: model_doc/dpr
+        title: DPR
+      - local: model_doc/electra
+        title: ELECTRA
+      - local: model_doc/encoder-decoder
+        title: Encoder Decoder Models
+      - local: model_doc/flaubert
+        title: FlauBERT
+      - local: model_doc/fnet
+        title: FNet
+      - local: model_doc/fsmt
+        title: FSMT
+      - local: model_doc/funnel
+        title: Funnel Transformer
+      - local: model_doc/openai-gpt
+        title: GPT
+      - local: model_doc/gpt_neo
+        title: GPT Neo
+      - local: model_doc/gpt_neox
+        title: GPT NeoX
+      - local: model_doc/gptj
+        title: GPT-J
+      - local: model_doc/gpt2
+        title: GPT2
+      - local: model_doc/herbert
+        title: HerBERT
+      - local: model_doc/ibert
+        title: I-BERT
+      - local: model_doc/jukebox
+        title: Jukebox
+      - local: model_doc/layoutlm
+        title: LayoutLM
+      - local: model_doc/led
+        title: LED
+      - local: model_doc/longformer
+        title: Longformer
+      - local: model_doc/longt5
+        title: LongT5
+      - local: model_doc/luke
+        title: LUKE
+      - local: model_doc/m2m_100
+        title: M2M100
+      - local: model_doc/marian
+        title: MarianMT
+      - local: model_doc/mbart
+        title: MBart and MBart-50
+      - local: model_doc/megatron-bert
+        title: MegatronBERT
+      - local: model_doc/megatron_gpt2
+        title: MegatronGPT2
+      - local: model_doc/mluke
+        title: mLUKE
+      - local: model_doc/mobilebert
+        title: MobileBERT
+      - local: model_doc/mpnet
+        title: MPNet
+      - local: model_doc/mt5
+        title: MT5
+      - local: model_doc/mvp
+        title: MVP
+      - local: model_doc/nezha
+        title: NEZHA
+      - local: model_doc/nllb
+        title: NLLB
+      - local: model_doc/nystromformer
+        title: Nyströmformer
+      - local: model_doc/opt
+        title: OPT
+      - local: model_doc/pegasus
+        title: Pegasus
+      - local: model_doc/phobert
+        title: PhoBERT
+      - local: model_doc/plbart
+        title: PLBart
+      - local: model_doc/prophetnet
+        title: ProphetNet
+      - local: model_doc/qdqbert
+        title: QDQBert
+      - local: model_doc/rag
+        title: RAG
+      - local: model_doc/realm
+        title: REALM
+      - local: model_doc/reformer
+        title: Reformer
+      - local: model_doc/rembert
+        title: RemBERT
+      - local: model_doc/retribert
+        title: RetriBERT
+      - local: model_doc/roberta
+        title: RoBERTa
+      - local: model_doc/roformer
+        title: RoFormer
+      - local: model_doc/splinter
+        title: Splinter
+      - local: model_doc/squeezebert
+        title: SqueezeBERT
+      - local: model_doc/t5
+        title: T5
+      - local: model_doc/t5v1.1
+        title: T5v1.1
+      - local: model_doc/tapas
+        title: TAPAS
+      - local: model_doc/tapex
+        title: TAPEX
+      - local: model_doc/transfo-xl
+        title: Transformer XL
+      - local: model_doc/ul2
+        title: UL2
+      - local: model_doc/xglm
+        title: XGLM
+      - local: model_doc/xlm
+        title: XLM
+      - local: model_doc/xlm-prophetnet
+        title: XLM-ProphetNet
+      - local: model_doc/xlm-roberta
+        title: XLM-RoBERTa
+      - local: model_doc/xlm-roberta-xl
+        title: XLM-RoBERTa-XL
+      - local: model_doc/xlnet
+        title: XLNet
+      - local: model_doc/yoso
+        title: YOSO
+      title: Text models
+    - isExpanded: false
+      sections:
+      - local: model_doc/beit
+        title: BEiT
+      - local: model_doc/convnext
+        title: ConvNeXT
+      - local: model_doc/cvt
+        title: CvT
+      - local: model_doc/deit
+        title: DeiT
+      - local: model_doc/detr
+        title: DETR
+      - local: model_doc/dit
+        title: DiT
+      - local: model_doc/dpt
+        title: DPT
+      - local: model_doc/glpn
+        title: GLPN
+      - local: model_doc/imagegpt
+        title: ImageGPT
+      - local: model_doc/levit
+        title: LeViT
+      - local: model_doc/maskformer
+        title: MaskFormer
+      - local: model_doc/mobilevit
+        title: MobileViT
+      - local: model_doc/owlvit
+        title: OWL-ViT
+      - local: model_doc/poolformer
+        title: PoolFormer
+      - local: model_doc/regnet
+        title: RegNet
+      - local: model_doc/resnet
+        title: ResNet
+      - local: model_doc/segformer
+        title: SegFormer
+      - local: model_doc/swin
+        title: Swin Transformer
+      - local: model_doc/swinv2
+        title: Swin Transformer V2
+      - local: model_doc/van
+        title: VAN
+      - local: model_doc/videomae
+        title: VideoMAE
+      - local: model_doc/vit
+        title: Vision Transformer (ViT)
+      - local: model_doc/vit_mae
+        title: ViTMAE
+      - local: model_doc/yolos
+        title: YOLOS
+      title: Vision models
+    - isExpanded: false
+      sections:
+      - local: model_doc/hubert
+        title: Hubert
+      - local: model_doc/mctct
+        title: MCTCT
+      - local: model_doc/sew
+        title: SEW
+      - local: model_doc/sew-d
+        title: SEW-D
+      - local: model_doc/speech_to_text
+        title: Speech2Text
+      - local: model_doc/speech_to_text_2
+        title: Speech2Text2
+      - local: model_doc/unispeech
+        title: UniSpeech
+      - local: model_doc/unispeech-sat
+        title: UniSpeech-SAT
+      - local: model_doc/wav2vec2
+        title: Wav2Vec2
+      - local: model_doc/wav2vec2-conformer
+        title: Wav2Vec2-Conformer
+      - local: model_doc/wav2vec2_phoneme
+        title: Wav2Vec2Phoneme
+      - local: model_doc/wavlm
+        title: WavLM
+      - local: model_doc/xls_r
+        title: XLS-R
+      - local: model_doc/xlsr_wav2vec2
+        title: XLSR-Wav2Vec2
+      title: Audio models
+    - isExpanded: false
+      sections:
+      - local: model_doc/clip
+        title: CLIP
+      - local: model_doc/data2vec
+        title: Data2Vec
+      - local: model_doc/flava
+        title: FLAVA
+      - local: model_doc/groupvit
+        title: GroupViT
+      - local: model_doc/layoutlmv2
+        title: LayoutLMV2
+      - local: model_doc/layoutlmv3
+        title: LayoutLMV3
+      - local: model_doc/layoutxlm
+        title: LayoutXLM
+      - local: model_doc/lxmert
+        title: LXMERT
+      - local: model_doc/perceiver
+        title: Perceiver
+      - local: model_doc/speech-encoder-decoder
+        title: Speech Encoder Decoder Models
+      - local: model_doc/trocr
+        title: TrOCR
+      - local: model_doc/vilt
+        title: ViLT
+      - local: model_doc/vision-encoder-decoder
+        title: Vision Encoder Decoder Models
+      - local: model_doc/vision-text-dual-encoder
+        title: Vision Text Dual Encoder
+      - local: model_doc/visual_bert
+        title: VisualBERT
+      title: Multimodal models
+    - isExpanded: false
+      sections:
+      - local: model_doc/decision_transformer
+        title: Decision Transformer
+      - local: model_doc/trajectory_transformer
+        title: Trajectory Transformer
+      title: Reinforcement learning models
     title: Models
   - sections:
     - local: internal/modeling_utils
@@ -457,4 +478,4 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
+  title: API
\ No newline at end of file
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6d31cabda7489..4d242ab7612ed 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2830,7 +2830,7 @@ def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, l
     # Sample a single window of length=n_ctx at position=start on level=level
     def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start):
         prior = self.priors[level]
-        n_samples = self.config.n_samples
+        n_samples = music_tokens[-1].shape[0]
         n_ctx = prior.n_ctx
         end = start + n_ctx
         # get music_tokens already sampled at current level

From 873cabd8503fd2d5f9b26c78914a2da5ab10dfa9 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 17:24:03 +0000
Subject: [PATCH 089/196] all tests pass (modelling tests)

---
 .../models/jukebox/configuration_jukebox.py     |  6 +++---
 .../models/jukebox/convert_jukebox.py           |  8 ++++----
 .../models/jukebox/modeling_jukebox.py          | 10 +++++-----
 .../models/jukebox/tokenization_jukebox.py      | 12 ++++--------
 tests/models/jukebox/test_modeling_jukebox.py   | 17 ++++++++---------
 5 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 705c9dd673e5b..1ca63dca5d4dc 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -22,8 +22,8 @@
 logger = logging.get_logger(__name__)
 
 JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "ArthurZ/jukebox-5b-lyrics": "https://huggingface.co/ArthurZ/jukebox-5b-lyrics/blob/main/config.json",
-    "ArthurZ/jukebox-1b-lyrics": "https://huggingface.co/ArthurZ/jukebox-1b-lyrics/blob/main/config.json",
+    "openai/jukebox-5b-lyrics": "https://huggingface.co/openai/jukebox-5b-lyrics/blob/main/config.json",
+    "openai/jukebox-1b-lyrics": "https://huggingface.co/openai/jukebox-1b-lyrics/blob/main/config.json",
 }
 
 
@@ -34,7 +34,7 @@ class JukeboxConfig(PretrainedConfig):
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
     yield a similar configuration to that of
-    [ArthurZ/jukebox-1b-lyrics](https://huggingface.co/ArthurZ/jukebox-1b-lyrics) architecture.
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
 
 
     The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index b6463d794ab9e..8e94549dba30b 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -75,14 +75,14 @@ def replace_key(key):
         return key.replace(".ln", ".layer_norm")
     if "_ln" in key:
         return key.replace("_ln", "_layer_norm")
-    
+
     if "prime_state_proj" in key:
         return key.replace("prime_state_proj", "lyric_encoder.proj_in")
     if "prime_x_out" in key:
         return key.replace("prime_x_out", "lyric_encoder.lm_head")
-    if "prior.x_out" in key: 
+    if "prior.x_out" in key:
         return key.replace("x_out", "fc_proj_out")
-    if "x_emb" in key:  
+    if "x_emb" in key:
         return key.replace("x_emb", "embed_tokens")
     return key
 
@@ -237,7 +237,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     #     prior_depth=[79, 72, 72],
     #     max_nb_genres=1,
     # )
-    config = JukeboxConfig(sample_length= 1058304)
+    config = JukeboxConfig(sample_length=1058304)
     model = JukeboxModel(config)
 
     weight_dict = []
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 4d242ab7612ed..c3cf168343614 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -52,9 +52,9 @@
 _TOKENIZER_FOR_DOC = "JukeboxTokenizer"
 
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "ArthurZ/jukebox-dummy",
-    "ArthurZ/jukebox-1b-lyrics",
-    "ArthurZ/jukebox-5b-lyrics",
+    "openai/jukebox-dummy",
+    "openai/jukebox-1b-lyrics",
+    "openai/jukebox-5b-lyrics",
     # See all Jukebox models at https://huggingface.co/models?filter=jukebox
 ]
 
@@ -1499,7 +1499,7 @@ def __init__(
         - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
           timing. When
         False, the start token is random.
-        - prime_len : for now ??????
+        - prime_len : for now len of the lyric hidden states
         """
         super().__init__()
         self.input_shape = input_shape
@@ -2157,7 +2157,7 @@ def rescale(music_tokens_shape):
             # TODO rename to encoder_kwargs as they are used both
             # when single and not
             lyric_enc_kwargs = dict(
-                embed_dim=config.prime_n_vocab, # previously bins
+                embed_dim=config.prime_n_vocab,  # previously bins
                 width=config.prime_width[-level - 1],
                 depth=config.prime_depth[-level - 1],
                 heads=config.prime_heads,
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index fb874a17fd4f0..f79daa3604315 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -76,7 +76,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
     the most relevant tokens (in time) for the sequence.
 
-    Args: # TODO : args to prettify
+    Args:
         full_tokens (`List[int]`):
             List containing the ids of the entire lyrics.
         total_length (`int`):
@@ -92,9 +92,6 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
-        # tokens = torch.cat([0] * (max_n_lyric_tokens - len(full_tokens)), full_tokens)
-        # did not handle that before but now the full_tokens are torch tensors
-        # because the tokenizer outputs tensors and not list (choice ma)
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
     else:
         assert 0 <= offset < total_length
@@ -102,9 +99,9 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
         midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
         tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
         indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
-    assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
-    assert len(indices) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
-    # assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
+    # assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
+    # assert len(indices) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
+    # # assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
     return tokens.unsqueeze(dim=0), indices
 
 
@@ -141,7 +138,6 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to:
     this superclass for more information regarding those methods.
 
-    # TODO: the original paper should support composing from 2 or more artists and genres.
     However the code does not allow that and only supports composing from various genres.
 
     Args:
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index b8317860c401a..22e4c157a2ecf 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,7 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "ArthurZ/jukebox-1b-lyrics"
+    model_id = "openai/jukebox-1b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -163,8 +163,8 @@ def test_slow_sampling(self):
 
         top_prior = model.priors[-1]
         start = 0
-        z_conds = top_prior.get_z_conds(zs, start=start, end=start + top_prior.n_ctx)
-        y = top_prior.get_y(labels[-1].clone(), start, 1058304, 0)
+        z_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
+        y = top_prior.get_metadata(labels[-1].clone(), start, 1058304, 0)
 
         self.assertIsNone(z_conds)
         self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
@@ -233,6 +233,7 @@ def test_vqvae(self):
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
+    model_id = "openai/jukebox-5b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -309,10 +310,9 @@ def prepare_inputs(self, model_id):
 
     @slow
     def test_sampling(self):
-        model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
 
-        labels = self.prepare_inputs(model_id)
+        labels = self.prepare_inputs(self.model_id)
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
@@ -333,10 +333,9 @@ def test_sampling(self):
 
     @slow
     def test_slow_sampling(self):
-        model_id = "ArthurZ/jukebox-5b-lyrics"
-        model = JukeboxModel.from_pretrained(model_id, min_duration=0).eval().to("cuda")
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval().to("cuda")
 
-        labels = [i.cuda() for i in self.prepare_inputs(model_id)]
+        labels = [i.cuda() for i in self.prepare_inputs(self.model_id)]
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)

From 15567f21d94353a7c955fc9aa6447ecdc2336d06 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 17:32:12 +0000
Subject: [PATCH 090/196] should start documenting

---
 src/transformers/models/jukebox/convert_jukebox.py | 9 ++++++---
 tests/models/jukebox/test_tokenization_jukebox.py  | 4 ++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 8e94549dba30b..7743afd2bb8a7 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -217,7 +217,8 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
 
     model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]]
 
-    # config = JukeboxConfig.from_pretrained("ArthurZ/" + model_name)
+    # config = JukeboxConfig.from_pretrained("openai/" + model_name)
+    # to convert the 5b lyric token model, use : or "openai/jukebox-5b-lyrics"
     # config = JukeboxConfig(
     #     timing_dims=128
     #     prior_attn_order=[10, 2, 2],
@@ -260,8 +261,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping)
         weight_dict.append(new_dic)
 
-    with open("mapping.json", "w") as txtfile:
-        json.dump(mapping, txtfile)
+
 
     vqvae_state_dict = weight_dict.pop(0)
     model.vqvae.load_state_dict(vqvae_state_dict)
@@ -269,6 +269,9 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         model.priors[i].load_state_dict(weight_dict[i])
 
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+    with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile:
+        json.dump(mapping, txtfile, sep="\n")
+
     print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
 
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index 74161be6794ab..beac3ec246c81 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -49,7 +49,7 @@ def test_1b_lyrics_tokenizer(self):
         """
         import torch
 
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-1b-lyrics")
+        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
         tokens = tokenizer(**self.metas)["input_ids"]
         # fmt: off
         EXPECTED_OUTPUT = [
@@ -132,7 +132,7 @@ def test_5b_lyrics_tokenizer(self):
         """
         import torch
 
-        tokenizer = JukeboxTokenizer.from_pretrained("ArthurZ/jukebox-5b-lyrics")
+        tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-5b-lyrics")
         tokens = tokenizer(**self.metas)["input_ids"]
         # fmt: off
         EXPECTED_OUTPUT = [

From a271c632f17c2a9cff751f1398db520391481c6d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 10 Aug 2022 17:56:12 +0000
Subject: [PATCH 091/196] nits

---
 docs/source/en/_toctree.yml                      |  3 ++-
 .../models/jukebox/configuration_jukebox.py      |  5 +----
 .../models/jukebox/convert_jukebox.py            |  7 -------
 .../models/jukebox/modeling_jukebox.py           |  2 --
 .../models/jukebox/sample_original_jukebox.py    | 16 ++++++++++++++++
 .../models/jukebox/tokenization_jukebox.py       |  5 +----
 6 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 35f8d6f5c6d63..2835366f70fba 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -478,4 +478,5 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
\ No newline at end of file
+  title: API
+  
\ No newline at end of file
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 1ca63dca5d4dc..de98aa222063b 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -1,6 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -61,7 +60,6 @@ class JukeboxConfig(PretrainedConfig):
     """
 
     model_type = "jukebox"
-    keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {
         "hidden_size": "n_embd",
         "max_position_embeddings": "n_positions",
@@ -132,7 +130,6 @@ def __init__(
         prior_attn_dropout=0,
         prior_resid_dropout=0,
         prior_emb_dropout=0,
-        # TODO rename to vqvae
         vqvae_levels=3,
         vqvae_downs_t=(3, 2, 2),
         vqvae_strides_t=(2, 2, 2),
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 7743afd2bb8a7..bbf8804bedd65 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -30,11 +30,6 @@
 logger = logging.get_logger(__name__)
 
 
-def rename_key(dct, old, new):
-    val = dct.pop(old)
-    dct[new] = val
-
-
 PREFIX = "https://openaipublic.azureedge.net/jukebox/models/"
 MODEL_MAPPING = {
     "jukebox-1b-lyrics": [
@@ -261,8 +256,6 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
         new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping)
         weight_dict.append(new_dic)
 
-
-
     vqvae_state_dict = weight_dict.pop(0)
     model.vqvae.load_state_dict(vqvae_state_dict)
     for i in range(len(weight_dict)):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index c3cf168343614..ac71af93d21f1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1,6 +1,5 @@
 # coding=utf-8
 # Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -2232,7 +2231,6 @@ def conditioner_block(_level):
                 n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs
             )
 
-        # TODO as the prior type can change, can't rename to decoder or enc_dec
         if config.single_enc_dec[-level - 1]:
             # Single encoder-decoder transformer
             self.prior_shapes = [(self.nb_relevant_lyric_tokens,), prior_kwargs.pop("input_shape")]
diff --git a/src/transformers/models/jukebox/sample_original_jukebox.py b/src/transformers/models/jukebox/sample_original_jukebox.py
index 86ebae3206363..c2727e188a30a 100644
--- a/src/transformers/models/jukebox/sample_original_jukebox.py
+++ b/src/transformers/models/jukebox/sample_original_jukebox.py
@@ -1,5 +1,21 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Team Authors the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # in order to be used, the following git repo has to be used :
 # git clone --branch adaptive_device https://github.com/ArthurZucker/jukebox.git
+
 import os
 import random
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index f79daa3604315..485ab6fc57c7f 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -338,8 +338,6 @@ def _normalize(self, text: str) -> str:
     def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
         return " ".join(lyrics)
 
-    # TODO : should add_token be implemeted for artists, genres and lyrics? Should it have
-    # a type argument to add an artist token with self.getattr('artist') ?
     def convert_to_tensors(
         self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
     ):
@@ -424,7 +422,6 @@ def __call__(self, artist, genres, lyrics, return_tensors="pt") -> BatchEncoding
         artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
 
         attention_masks = [-INFINITY] * len(full_tokens[-1])
-        # TODO properly handle the return pt tensor option
         input_ids = [
             self.convert_to_tensors(
                 [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors

From dae211d3f9e190f11a5531126d1221a4694a5179 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 11 Aug 2022 15:44:47 +0000
Subject: [PATCH 092/196] update

---
 docs/source/en/model_doc/jukebox.mdx          |  9 ++++--
 .../models/jukebox/modeling_jukebox.py        | 31 +++++++++++--------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 268bbac44ef5f..614e0ff2dd23f 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -47,12 +47,15 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 ## JukeboxTokenizer
 
-[[autodoc]] JukeboxTokenizer - save_vocabulary
+[[autodoc]] JukeboxTokenizer 
+    - save_vocabulary
 
 ## JukeboxModel
 
-[[autodoc]] JukeboxModel - forward 
+[[autodoc]] JukeboxModel 
+    - forward 
 
 ## JukeboxVQVAE 
 
-[[autodoc]] JukeboxVQVAE - forward 
+[[autodoc]] JukeboxVQVAE 
+    - forward 
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ac71af93d21f1..619a9a7db23b2 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1548,6 +1548,7 @@ def __init__(
         # TODO rename prime_len
         self.only_encode = only_encode
         self.prime_len = prime_len
+        # TODO rename fc_pro_out to LM head an probably use HF's linking trick
         if merged_decoder:
             # Merged piped model uses this setup
             self.add_cond_after_transformer = False
@@ -1557,6 +1558,7 @@ def __init__(
             self.share_embed_tokens_fc_proj_out = True
 
         if not only_encode:
+            # TODO rename fc_pro_out to LM head an probably use HF's linking trick
             self.fc_proj_out = nn.Linear(width, embed_dim, bias=False)
             if self.share_embed_tokens_fc_proj_out:
                 self.fc_proj_out.weight = self.embed_tokens.weight
@@ -2547,7 +2549,7 @@ def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
         """
         if self.lyric_conditioning:
             lyric_encoder_states = lyric_encoder_states.float()
-            lyric_encoder_states = self.lyric_encoder.proj_out(lyric_encoder_states)
+            lyric_encoder_states = self.lyric_encoder.lm_head(lyric_encoder_states)
             lyric_enc_loss = nn.functional.cross_entropy(
                 lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)
             ) / np.log(2.0)
@@ -2783,15 +2785,18 @@ def load_audio(file, sampling_rate, offset, duration, mono=False):
     return raw_audio
 
 
-def load_prompts(audio_files, duration, hps):
+def load_prompts(audio_files, duration, offset, hps):
+    n_samples = len(audio_files)
     raw_audio_list = []
     for audio_file in audio_files:
-        raw_audio = load_audio(audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=0.0, mono=True)
+        raw_audio = load_audio(
+            audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=offset, mono=True
+        )
         raw_audio = raw_audio.T  # CT -> TC
         raw_audio_list.append(raw_audio)
-    while len(raw_audio_list) < hps.n_samples:
+    while len(raw_audio_list) < n_samples:
         raw_audio_list.extend(raw_audio_list)
-    raw_audio_list = raw_audio_list[: hps.n_samples]
+    raw_audio_list = raw_audio_list[:n_samples]
     raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
     return raw_audio
 
@@ -2985,14 +2990,14 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sampling_rate=self.config.sampling_rate)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     empty_cache()
-                    alignments = get_alignment(
-                        music_tokens,
-                        labels[-1],
-                        self.priors[-1],
-                        level,
-                        sampling_kwargs[-1]["fp16"],
-                        self.config,
-                    )
+                    # alignments = get_alignment(
+                    #     music_tokens,
+                    #     labels[-1],
+                    #     self.priors[-1],
+                    #     level,
+                    #     sampling_kwargs[-1]["fp16"],
+                    #     self.config,
+                    # )
                     pass  # consumes too much ram
         return music_tokens
 

From c90032f202d32a741712a736304dec61e18ade3c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 15 Aug 2022 07:45:45 +0000
Subject: [PATCH 093/196] update and support for fp16

---
 .../models/jukebox/modeling_jukebox.py        | 48 +++++++++++--------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 619a9a7db23b2..853a7483ba64e 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -738,9 +738,11 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
 
     def forward(self, input):
         if input.numel() > self.max_numel:
-            return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+            # return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
         else:
-            return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
+            # return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
+            return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
 def repeat(hidden_states, n_repeat, dim):
@@ -1586,8 +1588,6 @@ def forward(
         metadata_conditioning=None,
         lyric_encoder_states=None,
         fp16=False,
-        loss_full=False,
-        encode=False,
         get_preds=False,
         get_acts=False,
         get_sep_loss=False,
@@ -1811,7 +1811,7 @@ def primed_sample(
                 )
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out = fp16
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -2534,6 +2534,8 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
                 self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
             lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
             lyric_acts = self.lyric_encoder.proj_in(lyric_acts)
+            if fp16:
+                lyric_acts = lyric_acts.half()
             lyric_encoder_states = self.lyric_encoder.final_layer_norm(lyric_acts)
             if sample:
                 self.lyric_encoder.cpu()
@@ -2690,11 +2692,11 @@ def get_starts(total_length, n_ctx, hop_length):
 
 
 # FIXME, consumes too much RAM so should probably be removed
-def get_alignment(music_tokens, labels, prior, level, fp16, config):
+def get_alignment(music_tokens, labels, prior, fp16, config):
     """
     Should compute the lyric to music token alignment, but for now it cannot be used.
     """
-    level = level - 1  # Top level used
+    level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]
     batch_size, total_length = tokens.shape[0], tokens.shape[1]
@@ -2714,7 +2716,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, config):
     indices_hops = {}
     prior.to(tokens.device)
     empty_cache()
-    for start in get_starts(total_length, n_ctx, hop_length):
+    for start in get_range(get_starts(total_length, n_ctx, hop_length)):
         end = start + n_ctx
 
         # set metadata offset, sample_length and lyrics tokens
@@ -2866,7 +2868,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # if there are no levels above should return None!
 
         # set metadata offset, sample_length and lyrics tokens
-        metadata = prior.get_metadata(labels, start, self.config.sample_length, offset)
+        metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
         empty_cache()
         max_batch_size = sampling_kwargs["max_batch_size"]
@@ -2926,6 +2928,7 @@ def _sample(
         offset=0,
         save_results=True,
         sample_length=None,
+        fp16 = False
     ):
         top_prior = self.priors[-1]
         total_length = (
@@ -2937,21 +2940,21 @@ def _sample(
         sampling_kwargs = [
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             ),
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             ),
             dict(
                 temp=sampling_temperature,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -2960,17 +2963,21 @@ def _sample(
         self.start_time = time.strftime("%Y-%m-%d-%Hh%M")
         if sample_levels is None:
             sample_levels = range(len(self.priors))
+
+        self.total_length = total_length # total length of the signal, might be bit different
         for level in reversed(sample_levels):
-            self.config.sample_length = total_length  # total length of the signal, might be bit different
+             
+
             # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
             empty_cache()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
-            total_length = self.config.sample_length // self.priors[level].raw_to_tokens
+            # self.priors[level].total_length = total_length // self.priors[level].raw_to_tokens
+            total_token_to_sample = total_length // self.priors[level].raw_to_tokens
             hop_length = int(self.config.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
             music_tokens = self.sample_level(
-                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_length, hop_length
+                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_token_to_sample, hop_length
             )
 
             self.priors[level].to("cpu")
@@ -3012,20 +3019,20 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
 
     # Continue ancestral sampling from previously saved codes
     def continue_sample(self, music_tokens, labels, **sampling_kwargs):
-        sample_levels = list(range(len(self.priors)))
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
     # Upsample given already generated upper-level codes
     def upsample(self, music_tokens, labels, **sampling_kwargs):
-        sample_levels = list(range(len(self.priors) - 1))
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors)-1)))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
     # Prompt the model with raw audio input (dimension: NTC) and generate continuations
     def primed_sample(self, raw_audio, labels, **sampling_kwargs):
-        sample_levels = list(range(len(self.priors)))
-        self.vqvae.to(raw_audio.device)
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+        self.vqvae.to(raw_audio.device).float()
         with torch.no_grad():
             music_tokens = self.vqvae.encode(
                 raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
@@ -3033,3 +3040,6 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         self.vqvae.to("cpu")
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
+
+# TODO add tied embeddings for the lyric encoder lm head as well as the proj_out when they are not seperated.
+# TODO should support cehckpointing attention as it is faster 
\ No newline at end of file

From 16fd65d694e1c36215c8ef2aa3349b20b333d0a2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 15 Aug 2022 07:46:54 +0000
Subject: [PATCH 094/196] begin checkpointing res for faster inference and
 lower memory consumption

---
 .../models/jukebox/modeling_jukebox.py        | 56 +++++++++++++------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index c3cf168343614..db205aea896b2 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -739,9 +739,9 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
 
     def forward(self, input):
         if input.numel() > self.max_numel:
-            return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
         else:
-            return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
+            return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
 def repeat(hidden_states, n_repeat, dim):
@@ -1814,6 +1814,8 @@ def primed_sample(
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
+                if fp16:
+                    hidden_states = hidden_states.half()
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
                 if get_preds:
                     preds.append(hidden_states)
@@ -2221,8 +2223,6 @@ def conditioner_block(_level):
                     stride_t=config.cond_strides_t[_level],
                     **audio_conditioning_kwargs,
                 )
-
-            # if dist.get_rank() == 0: print(f"Conditioning on 1 above level(s)")
             self.conditioner_blocks.append(conditioner_block(self.cond_level))
 
         # metadata conditioning : contioning on timing, genres, and artist
@@ -2690,11 +2690,11 @@ def get_starts(total_length, n_ctx, hop_length):
 
 
 # FIXME, consumes too much RAM so should probably be removed
-def get_alignment(music_tokens, labels, prior, level, fp16, config):
+def get_alignment(music_tokens, labels, prior, fp16, config):
     """
     Should compute the lyric to music token alignment, but for now it cannot be used.
     """
-    level = level - 1  # Top level used
+    level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]
     batch_size, total_length = tokens.shape[0], tokens.shape[1]
@@ -2714,7 +2714,7 @@ def get_alignment(music_tokens, labels, prior, level, fp16, config):
     indices_hops = {}
     prior.to(tokens.device)
     empty_cache()
-    for start in get_starts(total_length, n_ctx, hop_length):
+    for start in get_range(get_starts(total_length, n_ctx, hop_length)):
         end = start + n_ctx
 
         # set metadata offset, sample_length and lyrics tokens
@@ -2785,10 +2785,12 @@ def load_audio(file, sampling_rate, offset, duration, mono=False):
     return raw_audio
 
 
-def load_prompts(audio_files, duration, hps):
+def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds=10 ):
+    duration = sample_length_in_seconds * hps.sampling_rate
+    offset  = offset_in_seconds * hps.sampling_rate
     raw_audio_list = []
     for audio_file in audio_files:
-        raw_audio = load_audio(audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=0.0, mono=True)
+        raw_audio = load_audio(audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=offset, mono=True)
         raw_audio = raw_audio.T  # CT -> TC
         raw_audio_list.append(raw_audio)
     while len(raw_audio_list) < hps.n_samples:
@@ -2923,6 +2925,7 @@ def _sample(
         offset=0,
         save_results=True,
         sample_length=None,
+        fp16 = False
     ):
         top_prior = self.priors[-1]
         total_length = (
@@ -2934,21 +2937,21 @@ def _sample(
         sampling_kwargs = [
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             ),
             dict(
                 temp=0.99,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
-                sample_tokens=sample_tokens,
+                sample_tokens=sample_tokens
             ),
             dict(
                 temp=sampling_temperature,
-                fp16=False,
+                fp16=fp16,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -2984,14 +2987,13 @@ def _sample(
                 logdir = f"{self.start_time}/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                save_wav(logdir, level, metas=metas, aud=raw_audio, sampling_rate=self.config.sampling_rate)
+                save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     empty_cache()
                     alignments = get_alignment(
                         music_tokens,
                         labels[-1],
                         self.priors[-1],
-                        level,
                         sampling_kwargs[-1]["fp16"],
                         self.config,
                     )
@@ -3025,8 +3027,28 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         self.vqvae.to(raw_audio.device)
         with torch.no_grad():
             music_tokens = self.vqvae.encode(
-                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
+                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.sha@@pe[0]
             )
-        self.vqvae.to("cpu")
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
+
+# torch.utils.checkpoint.checkpoint
+
+# def create_custom_forward(module):
+#     def custom_forward(*inputs):
+#         return module(*inputs, past_key_value, output_attentions)
+
+#     return custom_forward
+
+# layer_outputs = torch.utils.checkpoint.checkpoint(
+#     create_custom_forward(layer_module),
+#     hidden_states,
+#     attention_mask,
+#     layer_head_mask,
+#     encoder_hidden_states,
+#     encoder_attention_mask,
+#     band_mask,
+#     from_mask,
+#     to_mask,
+#     blocked_encoder_mask,
+# )
\ No newline at end of file

From c3b6e5c202862184398dda17f711e128dd254a8f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 15 Aug 2022 12:34:02 +0000
Subject: [PATCH 095/196] update

---
 .../models/jukebox/modeling_jukebox.py        | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index cd4461c7741fa..cba1d2d6bf4c6 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2787,7 +2787,7 @@ def load_audio(file, sampling_rate, offset, duration, mono=False):
     return raw_audio
 
 
-def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds=10 ):
+def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds=10):
     duration = sample_length_in_seconds * hps.sampling_rate
     offset  = offset_in_seconds * hps.sampling_rate
     raw_audio_list = []
@@ -2797,9 +2797,9 @@ def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds
         )
         raw_audio = raw_audio.T  # CT -> TC
         raw_audio_list.append(raw_audio)
-    while len(raw_audio_list) < n_samples:
+    while len(raw_audio_list) < len(audio_files):
         raw_audio_list.extend(raw_audio_list)
-    raw_audio_list = raw_audio_list[:n_samples]
+    raw_audio_list = raw_audio_list[:len(audio_files)]
     raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
     return raw_audio
 
@@ -2998,13 +2998,13 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     empty_cache()
-                    alignments = get_alignment(
-                        music_tokens,
-                        labels[-1],
-                        self.priors[-1],
-                        sampling_kwargs[-1]["fp16"],
-                        self.config,
-                    )
+                    # alignments = get_alignment(
+                    #     music_tokens,
+                    #     labels[-1],
+                    #     self.priors[-1],
+                    #     sampling_kwargs[-1]["fp16"],
+                    #     self.config,
+                    # )
                     pass  # consumes too much ram
         return music_tokens
 
@@ -3035,7 +3035,7 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         self.vqvae.to(raw_audio.device).float()
         with torch.no_grad():
             music_tokens = self.vqvae.encode(
-                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.sha@@pe[0]
+                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
             )
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens

From bd31d2463f1124e1a233888eda0f99a80ab80725 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 17 Aug 2022 16:22:12 +0000
Subject: [PATCH 096/196] clean and remove checkpointing

---
 .../models/jukebox/configuration_jukebox.py   |   4 -
 .../models/jukebox/modeling_jukebox.py        | 265 +++++++-----------
 2 files changed, 109 insertions(+), 160 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index de98aa222063b..e8c639fc83f90 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -41,8 +41,6 @@ class JukeboxConfig(PretrainedConfig):
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
-
-
     Example:
 
     ```python
@@ -90,7 +88,6 @@ def __init__(
         cond_width=[128, 1024, 1024],
         cond_dilation_growth_rate=[1, 3, 3],
         cond_dilation_cycle=[None, 8, 8],
-        cond_c_res=[0, 1, 1],
         cond_res_scale=[None, True, False],
         cond_m_conv=1,
         cond_downs_t=(3, 2, 2),
@@ -180,7 +177,6 @@ def __init__(
         self.cond_width = cond_width
         self.cond_dilation_growth_rate = cond_dilation_growth_rate
         self.cond_dilation_cycle = cond_dilation_cycle
-        self.cond_c_res = cond_c_res
         self.cond_zero_out = cond_zero_out
         self.cond_m_conv = cond_m_conv
         self.cond_res_scale = cond_res_scale
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 853a7483ba64e..4bb3cd8a1da06 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -14,27 +14,16 @@
 # limitations under the License.
 """PyTorch Jukebox model."""
 
+import gc
 import math
 import os
+import sys
 import time
 
 import numpy as np
 import torch
 import torch.nn.functional as F
-import torch.utils.checkpoint
-from packaging import version
 from torch import nn
-
-
-if version.parse(torch.__version__) >= version.parse("1.6"):
-    is_amp_available = True
-    # from torch.cuda.amp import autocast
-else:
-    is_amp_available = False
-
-import gc
-import sys
-
 from tqdm import tqdm
 
 from ...activations import ACT2FN
@@ -51,18 +40,12 @@
 _TOKENIZER_FOR_DOC = "JukeboxTokenizer"
 
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "openai/jukebox-dummy",
     "openai/jukebox-1b-lyrics",
     "openai/jukebox-5b-lyrics",
     # See all Jukebox models at https://huggingface.co/models?filter=jukebox
 ]
 
 
-def empty_cache():
-    gc.collect()
-    torch.cuda.empty_cache()
-
-
 def get_range(list):
     return tqdm(
         list,
@@ -72,8 +55,6 @@ def get_range(list):
     )
 
 
-####################################################################
-# Attention and scalable transformer
 # Import FusedLayerNorm if we have apex, otherwise use regular LayerNorm
 try:
     from apex.normalization import FusedLayerNorm
@@ -108,7 +89,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class ResConv1DBlock(nn.Module):
+class JukeboxResConv1DBlock(nn.Module):
     def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
         super().__init__()
         padding = dilation
@@ -126,7 +107,7 @@ def forward(self, hidden_states):
         return residuals + self.res_scale * hidden_states
 
 
-class Resnet1D(nn.Module):
+class JukeboxResnet1D(nn.Module):
     def __init__(
         self,
         n_in,
@@ -137,7 +118,6 @@ def __init__(
         zero_out=False,
         res_scale=False,
         reverse_dilation=False,
-        checkpoint_res=False,
     ):
         super().__init__()
 
@@ -150,7 +130,7 @@ def _get_depth(depth):
         blocks = []
         for depth in range(n_depth):
             blocks.append(
-                ResConv1DBlock(
+                JukeboxResConv1DBlock(
                     n_in,
                     int(m_conv * n_in),
                     dilation=dilation_growth_rate ** _get_depth(depth),
@@ -159,7 +139,6 @@ def _get_depth(depth):
                 )
             )
 
-        self.checkpoint_res = checkpoint_res
         if reverse_dilation:
             blocks = blocks[::-1]
         self.resnet_block = nn.ModuleList(blocks)
@@ -170,7 +149,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class EncoderConvBlock(nn.Module):
+class JukeboxEncoderConvBlock(nn.Module):
     def __init__(
         self,
         input_emb_width,
@@ -192,7 +171,7 @@ def __init__(
             for i in range(down_t):
                 blocks.append(nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t))
                 blocks.append(
-                    Resnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale)
+                    JukeboxResnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale)
                 )
         self.proj_out = nn.Conv1d(width, output_emb_width, 3, 1, 1)
         self.downsample_block = nn.ModuleList(blocks)
@@ -204,7 +183,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class DecoderConvBock(nn.Module):
+class JukeboxDecoderConvBock(nn.Module):
     def __init__(
         self,
         input_emb_width,
@@ -219,7 +198,6 @@ def __init__(
         zero_out=False,
         res_scale=False,
         reverse_decoder_dilation=False,
-        checkpoint_res=False,
     ):
         super().__init__()
         blocks = []
@@ -228,7 +206,7 @@ def __init__(
             self.proj_in = nn.Conv1d(output_emb_width, width, 3, 1, 1)
             for i in range(down_t):
                 blocks.append(
-                    Resnet1D(
+                    JukeboxResnet1D(
                         width,
                         depth,
                         m_conv,
@@ -237,7 +215,6 @@ def __init__(
                         zero_out=zero_out,
                         res_scale=res_scale,
                         reverse_dilation=reverse_decoder_dilation,
-                        checkpoint_res=checkpoint_res,
                     )
                 )
                 blocks.append(
@@ -255,7 +232,7 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class Encoder(nn.Module):
+class JukeboxEncoder(nn.Module):
     def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
         super().__init__()
         self.input_emb_width = input_emb_width
@@ -269,7 +246,7 @@ def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t
             del block_kwargs_copy["reverse_decoder_dilation"]
 
         def level_block(level, down_t, stride_t):
-            return EncoderConvBlock(
+            return JukeboxEncoderConvBlock(
                 input_emb_width if level == 0 else output_emb_width,
                 output_emb_width,
                 down_t,
@@ -295,7 +272,7 @@ def forward(self, hidden_states):
         return all_hidden_states
 
 
-class Decoder(nn.Module):
+class JukeboxDecoder(nn.Module):
     def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
         super().__init__()
         self.input_emb_width = input_emb_width
@@ -305,7 +282,7 @@ def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t
         self.strides_t = strides_t
 
         def level_block(level, down_t, stride_t):
-            return DecoderConvBock(output_emb_width, output_emb_width, down_t, stride_t, **block_kwargs)
+            return JukeboxDecoderConvBock(output_emb_width, output_emb_width, down_t, stride_t, **block_kwargs)
 
         self.level_blocks = nn.ModuleList()
         iterator = zip(list(range(self.levels)), downs_t, strides_t)
@@ -599,7 +576,7 @@ def _block_kwargs(level):
             return this_block_kwargs
 
         def encoder(level):
-            return Encoder(
+            return JukeboxEncoder(
                 x_channels,
                 codebook_width,
                 level + 1,
@@ -609,7 +586,7 @@ def encoder(level):
             )
 
         def decoder(level):
-            return Decoder(
+            return JukeboxDecoder(
                 x_channels,
                 codebook_width,
                 level + 1,
@@ -710,9 +687,6 @@ def forward(self, raw_audio):
         return dequantised_state, loss
 
 
-# Jukebox autoregressive model and its building blocks
-
-
 class JukeboxMLP(nn.Module):
     def __init__(self, width, n_state, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
         # a single channel is always used in original code
@@ -738,10 +712,8 @@ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
 
     def forward(self, input):
         if input.numel() > self.max_numel:
-            # return F.layer_norm(input.float(), self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
             return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
         else:
-            # return super(JukeboxLayerNorm, self).forward(input.float()).type_as(input)
             return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
@@ -797,7 +769,6 @@ def __init__(
         mask=False,
         zero_out=False,
         init_scale=1.0,
-        checkpoint_attn=0,
         attn_func=0,
         blocks=None,
         spread=None,
@@ -839,7 +810,6 @@ def __init__(
         self.spread = spread
         if blocks is not None:
             self.block_ctx = n_ctx // blocks
-        self.checkpoint_attn = checkpoint_attn  # 0: None, 1: Attn after heads split, 2: Attn
 
         self.sample_t = 0
         self.cache = {}
@@ -912,7 +882,7 @@ def block_attn(self, query, key, value, sample):
             self.blocks,
             self.block_ctx,
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
-        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
@@ -932,11 +902,11 @@ def transpose_block_attn(self, query, key, value, sample):
             self.blocks,
             self.block_ctx,
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
-        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
-            block_l = (seq_len - 1) % block_ctx
-            key = key[:, block_l::block_ctx, :]
-            value = value[:, block_l::block_ctx, :]
+            block_len = (seq_len - 1) % block_ctx
+            key = key[:, block_len::block_ctx, :]
+            value = value[:, block_len::block_ctx, :]
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             query_length = query.shape[1]
@@ -971,7 +941,7 @@ def prev_block_attn(self, query, key, value, sample):
             self.blocks,
             self.block_ctx,
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
-        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
             block = (seq_len - 1) // block_ctx
@@ -1014,7 +984,7 @@ def summary_attn(self, query, key, value, sample):
             self.blocks,
             self.block_ctx,
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
-        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             key = torch.nn.functional.pad(key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
             value = torch.nn.functional.pad(
@@ -1036,7 +1006,7 @@ def summary_spread_attn(self, query, key, value, sample):
             self.block_ctx,
             self.spread,
         )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
-        batch_size, seq_len, embed_dim = value.shape  # For sample, q_l = 1, k_l = v_l = sample_t
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             assert False, "Not yet implemented"
             # key = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, embed_dim)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, embed_dim)
@@ -1238,8 +1208,6 @@ def __init__(
         res_scale=1.0,
         m_attn=0.25,
         m_mlp=1.0,
-        checkpoint_attn=0,
-        checkpoint_mlp=0,
         attn_func=0,
         blocks=None,
         spread=None,
@@ -1258,7 +1226,6 @@ def __init__(
             mask=mask,
             zero_out=zero_out,
             init_scale=init_scale,
-            checkpoint_attn=checkpoint_attn,
             attn_func=attn_func,
             blocks=blocks,
             spread=spread,
@@ -1278,10 +1245,6 @@ def __init__(
         self.layer_norm_1 = JukeboxLayerNorm(width)
         self.res_scale = res_scale
 
-        # TODO either support checkpointing for faster inference or get rid of this
-        self.checkpoint_attn = checkpoint_attn
-        self.checkpoint_mlp = checkpoint_mlp
-
         self.width = width
         self.attn_func = attn_func
 
@@ -1316,9 +1279,6 @@ def __init__(
         res_scale=False,
         m_attn=0.25,
         m_mlp=1.0,
-        checkpoint_attn=0,
-        checkpoint_mlp=0,
-        checkpoint_res=0,
         attn_order=0,
         blocks=None,
         spread=None,
@@ -1358,9 +1318,6 @@ def __init__(
             else [1, 2, 3][d % 3],  # Used by single_enc_dec model with lyrics
         }[attn_order]
 
-        # attn_cycle = {0: 1, 1: 2, 2: 3, 3: 2, 4: 2, 5: 4, 6: 4, 7: 16, 8: 10, 9: 4, 10: 79, 11: 16, 12: 16}[attn_order]
-        # assert n_depth % attn_cycle == 0, f'Depth {n_depth} not a multiple of cycle {attn_cycle} for attn_order {attn_order}'
-
         def attn_block(d):
             return JukeboxBlock(
                 width=width,
@@ -1376,8 +1333,6 @@ def attn_block(d):
                 res_scale=res_scale,
                 m_attn=m_attn,
                 m_mlp=m_mlp,
-                checkpoint_attn=checkpoint_attn,
-                checkpoint_mlp=checkpoint_mlp,
                 attn_func=attn_func(d),
                 blocks=blocks,
                 spread=spread,
@@ -1385,7 +1340,6 @@ def attn_block(d):
                 prime_len=prime_len,
             )
 
-        self.checkpoint_res = checkpoint_res
         self._attn_mods = nn.ModuleList()
         for d in range(n_depth):
             self._attn_mods.append(attn_block(d))
@@ -1476,9 +1430,6 @@ def __init__(
         pos_init=False,
         m_attn=0.25,
         m_mlp=1,
-        checkpoint_res=0,
-        checkpoint_attn=0,
-        checkpoint_mlp=0,
         attn_order=0,
         blocks=None,
         spread=None,
@@ -1538,9 +1489,6 @@ def __init__(
             res_scale=res_scale,
             m_attn=m_attn,
             m_mlp=m_mlp,
-            checkpoint_attn=checkpoint_attn,
-            checkpoint_mlp=checkpoint_mlp,
-            checkpoint_res=checkpoint_res,
             attn_order=attn_order,
             blocks=blocks,
             spread=spread,
@@ -1605,7 +1553,9 @@ def forward(
 
         target = tokens  # Target
         hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
-        hidden_states = roll(hidden_states, 1)  # Shift by 1, and fill in start token
+        hidden_states = torch.cat(
+            (hidden_states[:, -1:], hidden_states[:, :-1]), dim=1
+        )  # Shift by 1, and fill in start token
         if self.metadata_conditioning:
             hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
         else:
@@ -1724,6 +1674,11 @@ def sample(
         else:
             return tokens
 
+    def split_chunks(length, chunk_size):
+        n_passes = (length + chunk_size - 1) // chunk_size
+        chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
+        return chunk_sizes
+
     def primed_sample(
         self,
         n_samples,
@@ -1762,7 +1717,7 @@ def primed_sample(
             if chunk_size is None:
                 chunk_size = len(sampled_audio)
             # assert len(sampled_audio) % chunk_size == 0, f'expected {len(sampled_audio)} to be divisible by {chunk_size}'
-            chunk_sizes = split_chunks(len(sampled_audio), chunk_size)
+            chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
             x_primes = []
             start = 0
             hidden_states = None
@@ -1778,7 +1733,7 @@ def primed_sample(
                     sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
                 start = start + current_chunk_size
-
+                # TODO rename x_prime, con_prime
                 x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1)
                 del sampled_audio_prime
                 del conds_prime
@@ -1799,11 +1754,13 @@ def primed_sample(
                 x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
-            empty_cache()
+            gc.collect()
+            torch.cuda.empty_cache()
             self.transformer.check_cache(n_samples, len(sampled_audio), fp16)
 
             hidden_states = sampled_audio[-1]
-            empty_cache()
+            gc.collect()
+            torch.cuda.empty_cache()
 
             for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
                 hidden_states, cond = self.get_emb(
@@ -1811,7 +1768,7 @@ def primed_sample(
                 )
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out = fp16
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -1880,22 +1837,10 @@ def get_normal(*shape, std=0.01):
     return w
 
 
-def roll(hidden_states, n):
-    return torch.cat((hidden_states[:, -n:], hidden_states[:, :-n]), dim=1)
-
-
-def split_chunks(length, chunk_size):
-    n_passes = (length + chunk_size - 1) // chunk_size
-    chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
-    assert sum(chunk_sizes) == length
-    return chunk_sizes
-
-
-# second most important renaming
-class MusicTokenConditioner(nn.Module):
+class JukeboxMusicTokenConditioner(nn.Module):
     """
-    The MusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE codebook) and
-    upsamples it using a single layer of decoder convolution block (the same is used in the VQ-VAE).
+    The JukeboxMusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE
+    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQ-VAE).
 
     The embedding layer is different from the vaqvae's bottleneck
 
@@ -1909,8 +1854,8 @@ def __init__(
         self.embed_tokens = nn.Embedding(embed_dim, out_width)
         nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
 
-        # MusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
-        self.upsampler = DecoderConvBock(
+        # JukeboxMusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
+        self.upsampler = JukeboxDecoderConvBock(
             self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
         )
         self.layer_norm = JukeboxLayerNorm(self.width)
@@ -1945,16 +1890,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         return hidden_states
 
 
-def flip(hidden_states):
-    def _flip(hidden_states):
-        return hidden_states.permute(0, 2, 1).contiguous()
-
-    if isinstance(hidden_states, (list, tuple)):
-        return [flip(z) for z in hidden_states]
-    return _flip(hidden_states)
-
-
-class SimpleEmbedding(nn.Module):
+class JukeboxSimpleEmbedding(nn.Module):
     def __init__(self, embed_dim, out_width, init_scale):
         super().__init__()
         self.embed_dim = embed_dim
@@ -1964,7 +1900,7 @@ def forward(self, y):
         return self.emb(y)
 
 
-class RangeEmbedding(nn.Module):
+class JukeboxRangeEmbedding(nn.Module):
     # Interpolating
     # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
     #
@@ -2036,8 +1972,8 @@ def __init__(
         # TODO rename bins
         bow_genre_bins, artist_bins = metadata_dims
         self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = SimpleEmbedding(bow_genre_bins, out_width, init_scale)
-        self.artist_emb = SimpleEmbedding(artist_bins, out_width, init_scale)
+        self.bow_genre_emb = JukeboxSimpleEmbedding(bow_genre_bins, out_width, init_scale)
+        self.artist_emb = JukeboxSimpleEmbedding(artist_bins, out_width, init_scale)
         self.include_time_signal = include_time_signal
         if self.include_time_signal:
             t_ranges = (
@@ -2047,9 +1983,11 @@ def __init__(
             )  # Relative pos
             assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
             total_length_range, absolute_pos_range, relative_pos_range = t_ranges
-            self.total_length_emb = RangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
-            self.absolute_pos_emb = RangeEmbedding(n_time, timing_dims, absolute_pos_range, out_width, init_scale)
-            self.relative_pos_emb = RangeEmbedding(
+            self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
+            self.absolute_pos_emb = JukeboxRangeEmbedding(
+                n_time, timing_dims, absolute_pos_range, out_width, init_scale
+            )
+            self.relative_pos_emb = JukeboxRangeEmbedding(
                 n_time, timing_dims, relative_pos_range, out_width, init_scale, clamp=True
             )
 
@@ -2087,7 +2025,7 @@ class JukeboxPrior(nn.Module):
     """
     Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
     on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
-    we use the MusicTokenConditioner class To condition on lyrics, we allow two types of priors:
+    we use the JukeboxMusicTokenConditioner class To condition on lyrics, we allow two types of priors:
     - Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer
       autoregressively
     models the lyrics, and we use its last layer to produce keys/values that are attened to by the decoder transformer
@@ -2188,7 +2126,6 @@ def rescale(music_tokens_shape):
             dilation_cycle=config.cond_dilation_cycle[-level - 1],
             zero_out=config.cond_zero_out,
             res_scale=config.cond_res_scale[-level - 1],
-            checkpoint_res=config.cond_c_res[-level - 1],
         )  # have to keep this else names wrong
 
         metadata_conditioning_kwargs = dict(
@@ -2215,7 +2152,7 @@ def rescale(music_tokens_shape):
             self.conditioner_blocks = nn.ModuleList()
 
             def conditioner_block(_level):
-                return MusicTokenConditioner(
+                return JukeboxMusicTokenConditioner(
                     input_shape=music_tokens_shapes[_level],
                     embed_dim=config.prior_latent_dim,
                     down_t=config.cond_downs_t[_level],
@@ -2550,7 +2487,7 @@ def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
         Computes the loss for the lyric encoder, next token prediction.
         """
         if self.lyric_conditioning:
-            lyric_encoder_states = lyric_encoder_states.float()
+            # lyric_encoder_states = lyric_encoder_states.float()
             lyric_encoder_states = self.lyric_encoder.lm_head(lyric_encoder_states)
             lyric_enc_loss = nn.functional.cross_entropy(
                 lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)
@@ -2667,17 +2604,7 @@ def __init__(self, *inputs, **kwargs):
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-
-def split_batch(obj, n_samples, split_size):
-    n_passes = (n_samples + split_size - 1) // split_size
-    if isinstance(obj, torch.Tensor):
-        return torch.split(obj, split_size, dim=0)
-    elif isinstance(obj, list):
-        return list(zip(*[torch.split(item, split_size, dim=0) for item in obj]))
-    elif obj is None:
-        return [None] * n_passes
-    else:
-        raise TypeError("Unknown input type")
+JUKEBOX_SAMPLE_INPUT_DOCSTRING = r""""""
 
 
 # Break total_length into hops/windows of size n_ctx separated by hop_length
@@ -2691,10 +2618,10 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-# FIXME, consumes too much RAM so should probably be removed
+# NOTE, consumes a lot of RAM so should probably be ran on CPU
 def get_alignment(music_tokens, labels, prior, fp16, config):
     """
-    Should compute the lyric to music token alignment, but for now it cannot be used.
+    Compute the lyric to music token alignment, but for now it cannot be used.
     """
     level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
@@ -2714,18 +2641,21 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    prior.to(tokens.device)
-    empty_cache()
+    # prior.to(tokens.device)
+    prior.to("cpu")
+    gc.collect()
+    torch.cuda.empty_cache()
     for start in get_range(get_starts(total_length, n_ctx, hop_length)):
         end = start + n_ctx
-
         # set metadata offset, sample_length and lyrics tokens
         metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
-
+        metadata.to("cpu")
         tokens_bs = torch.chunk(tokens, batch_size, dim=0)
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
+            tokens_i = tokens_i.to("cpu")
+            metadata_i = metadata_i.to("cpu")
             w_hop = prior.forward_tokens(
                 tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
             )
@@ -2741,7 +2671,8 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
         indices_hops[start] = indices_hop
         alignment_hops[start] = alignment_hop
     prior.cpu()
-    empty_cache()
+    gc.collect()
+    torch.cuda.empty_cache()
 
     # Combine attn for each hop into attn for full range
     # Use indices to place them into correct place for corresponding source tokens
@@ -2817,6 +2748,17 @@ def __init__(self, config):
         config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
+    def split_batch(obj, n_samples, split_size):
+        n_passes = (n_samples + split_size - 1) // split_size
+        if isinstance(obj, torch.Tensor):
+            return torch.split(obj, split_size, dim=0)
+        elif isinstance(obj, list):
+            return list(zip(*[torch.split(item, split_size, dim=0) for item in obj]))
+        elif obj is None:
+            return [None] * n_passes
+        else:
+            raise TypeError("Unknown input type")
+
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
     def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample):
         prior = self.priors[level]
@@ -2870,13 +2812,14 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
-        empty_cache()
+        gc.collect()
+        torch.cuda.empty_cache()
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
-        music_tokens_list = split_batch(previous_sampled_tokens, n_samples, max_batch_size)
-        music_tokens_conds_list = split_batch(music_tokens_conds, n_samples, max_batch_size)
-        metadata_list = split_batch(metadata, n_samples, max_batch_size)
+        music_tokens_list = self.split_batch(previous_sampled_tokens, n_samples, max_batch_size)
+        music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
+        metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
         tokens = []
         for music_tokens_i, music_tokens_conds_i, metadata_i in zip(
             music_tokens_list, music_tokens_conds_list, metadata_list
@@ -2928,7 +2871,7 @@ def _sample(
         offset=0,
         save_results=True,
         sample_length=None,
-        fp16 = False
+        fp16=False,
     ):
         top_prior = self.priors[-1]
         total_length = (
@@ -2964,13 +2907,13 @@ def _sample(
         if sample_levels is None:
             sample_levels = range(len(self.priors))
 
-        self.total_length = total_length # total length of the signal, might be bit different
+        self.total_length = total_length  # total length of the signal, might be bit different
         for level in reversed(sample_levels):
-             
 
             # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
-            empty_cache()
+            gc.collect()
+            torch.cuda.empty_cache()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             # self.priors[level].total_length = total_length // self.priors[level].raw_to_tokens
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
@@ -2981,7 +2924,8 @@ def _sample(
             )
 
             self.priors[level].to("cpu")
-            empty_cache()
+            gc.collect()
+            torch.cuda.empty_cache()
             self.vqvae.to(music_tokens[level].device)
             # Decode sample
             with torch.no_grad():
@@ -2996,15 +2940,18 @@ def _sample(
                     os.makedirs(logdir)
                 save_wav(logdir, level, metas=metas, aud=raw_audio, sampling_rate=self.config.sampling_rate)
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
-                    empty_cache()
-                    # alignments = get_alignment(
-                    #     music_tokens,
-                    #     labels[-1],
-                    #     self.priors[-1],
-                    #     level,
-                    #     sampling_kwargs[-1]["fp16"],
-                    #     self.config,
-                    # )
+                    gc.collect()
+                    torch.cuda.empty_cache()
+                    # should memory profile this function it takes up to 77% of 88GB RAM which seems to be a loooooooot 96%
+                    # has to be either a memory leak or, since all the attention is stored, everything is stored and it
+                    # is not optimal
+                    alignments = get_alignment(
+                        music_tokens,
+                        labels[-1],
+                        self.priors[-1],
+                        sampling_kwargs[-1]["fp16"],
+                        self.config,
+                    )
                     pass  # consumes too much ram
         return music_tokens
 
@@ -3025,7 +2972,7 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs):
 
     # Upsample given already generated upper-level codes
     def upsample(self, music_tokens, labels, **sampling_kwargs):
-        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors)-1)))
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1)))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
@@ -3041,5 +2988,11 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
+
 # TODO add tied embeddings for the lyric encoder lm head as well as the proj_out when they are not seperated.
-# TODO should support cehckpointing attention as it is faster 
\ No newline at end of file
+# TODO should support cehckpointing attention as it is faster
+
+# Training the prior on next token prediction using a bert tokenizer would make more sens than only predicting the letter
+# Indeed the model in unconditional sampling does not generate proper lyrics. Thus should have been
+# That is why we hear jibbrish. Could also have 3 levels of encoding were you predict entires sentences corresponding to the
+# highest level, then lower level words etc

From 00c614cb00ac1e6450346b433ad40a7c844a4646 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 17 Aug 2022 16:29:13 +0000
Subject: [PATCH 097/196] clean renamed comments

---
 .../models/jukebox/modeling_jukebox.py          | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 18f0263bda01d..ec76c90913e13 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1411,7 +1411,6 @@ def forward(self):
         return pos_emb
 
 
-# Most important renaming has to happen here
 class JukeboxConditionalAutoregressive(nn.Module):
     def __init__(
         self,
@@ -1806,7 +1805,6 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
         top_k >0: keep only top key tokens with highest probability (top-k filtering).
         top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
     """
-    # assert logits.dim() == 2  # batch size 1 for now - could be updated for more but the code would be less clear
     logits = logits.clone()
     top_k = min(top_k, logits.size(-1))  # Safety check
     assert (top_k == 0) or (top_p == 0.0)
@@ -2133,8 +2131,8 @@ def rescale(music_tokens_shape):
         metadata_conditioning_kwargs = dict(
             out_width=config.prior_width[-level - 1],
             init_scale=config.prior_init_scale[-level - 1],
-            metadata_dims=config.metadata_dims[-level - 1],  # rename to metadata_dims
-            timing_dims=config.timing_dims,  # rename to timing_dims or timing_intervals
+            metadata_dims=config.metadata_dims[-level - 1],  
+            timing_dims=config.timing_dims,   
             sampling_rate=config.sampling_rate,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
@@ -2161,6 +2159,7 @@ def conditioner_block(_level):
                     stride_t=config.cond_strides_t[_level],
                     **audio_conditioning_kwargs,
                 )
+
             self.conditioner_blocks.append(conditioner_block(self.cond_level))
 
         # metadata conditioning : contioning on timing, genres, and artist
@@ -2718,9 +2717,9 @@ def load_audio(file, sampling_rate, offset, duration, mono=False):
     return raw_audio
 
 
-def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds=10):
+def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_seconds=10):
     duration = sample_length_in_seconds * hps.sampling_rate
-    offset  = offset_in_seconds * hps.sampling_rate
+    offset = offset_in_seconds * hps.sampling_rate
     raw_audio_list = []
     for audio_file in audio_files:
         raw_audio = load_audio(
@@ -2730,12 +2729,10 @@ def load_prompts(audio_files,hps, sample_length_in_seconds=70, offset_in_seconds
         raw_audio_list.append(raw_audio)
     while len(raw_audio_list) < len(audio_files):
         raw_audio_list.extend(raw_audio_list)
-    raw_audio_list = raw_audio_list[:len(audio_files)]
+    raw_audio_list = raw_audio_list[: len(audio_files)]
     raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
     return raw_audio
 
-
-# a little bit of renaming to do here, especially regarind "z"
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,
@@ -2894,7 +2891,7 @@ def _sample(
                 fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
-                sample_tokens=sample_tokens
+                sample_tokens=sample_tokens,
             ),
             dict(
                 temp=sampling_temperature,

From fa622e988f6fa1f40186eac53d815f3c5592e2b2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 17 Aug 2022 16:29:29 +0000
Subject: [PATCH 098/196] style

---
 src/transformers/models/jukebox/modeling_jukebox.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ec76c90913e13..595f2db35e687 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2131,8 +2131,8 @@ def rescale(music_tokens_shape):
         metadata_conditioning_kwargs = dict(
             out_width=config.prior_width[-level - 1],
             init_scale=config.prior_init_scale[-level - 1],
-            metadata_dims=config.metadata_dims[-level - 1],  
-            timing_dims=config.timing_dims,   
+            metadata_dims=config.metadata_dims[-level - 1],
+            timing_dims=config.timing_dims,
             sampling_rate=config.sampling_rate,
             min_duration=config.min_duration,
             max_duration=config.max_duration,
@@ -2733,6 +2733,7 @@ def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_second
     raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
     return raw_audio
 
+
 @add_start_docstrings(
     "The bare JUKEBOX Model from which you can sample",
     JUKEBOX_START_DOCSTRING,

From d55a516b677efd7dd1b775e170e6808542adb7f1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 12:48:15 +0000
Subject: [PATCH 099/196] rename prime ->  lyric_enc

---
 .../models/jukebox/configuration_jukebox.py   |  70 +++++-----
 .../models/jukebox/modeling_jukebox.py        | 129 ++++++++----------
 2 files changed, 94 insertions(+), 105 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index e8c639fc83f90..6533dc08ba575 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -79,7 +79,6 @@ def __init__(
         nb_relevant_lyric_tokens=[384, 0, 0],
         min_duration=17.84,
         max_duration=600.0,
-        fp16_params=True,
         max_nb_genres=5,
         init_std=0.2,
         hop_fraction=[0.125, 0.5, 0.5],
@@ -92,23 +91,23 @@ def __init__(
         cond_m_conv=1,
         cond_downs_t=(3, 2, 2),
         cond_strides_t=(2, 2, 2),
-        prime_spread=None,
-        prime_width=[128, 128, 128],
-        prime_depth=[18, 3, 3],
-        prime_heads=4,
-        prime_m_attn=0.25,
-        prime_m_mlp=1.0,
-        prime_blocks=32,
-        prime_init_scale=[0.1, 0.4, 0.4],
-        prime_loss_fraction=[0.4, 0.0, 0.0],
-        prime_attn_order=[2, 0, 0],
-        prime_attn_dropout=0.0,
-        prime_resid_dropout=0.0,
-        prime_emb_dropout=0.0,
-        prime_zero_out=False,
-        prime_res_scale=False,
-        prime_pos_init=False,
-        prime_n_vocab=79,
+        lyric_enc_spread=None,
+        lyric_enc_width=[128, 128, 128],
+        lyric_enc_depth=[18, 3, 3],
+        lyric_enc_heads=4,
+        lyric_enc_m_attn=0.25,
+        lyric_enc_m_mlp=1.0,
+        lyric_enc_blocks=32,
+        lyric_enc_init_scale=[0.1, 0.4, 0.4],
+        lyric_enc_loss_fraction=[0.4, 0.0, 0.0],
+        lyric_enc_attn_order=[2, 0, 0],
+        lyric_enc_attn_dropout=0.0,
+        lyric_enc_resid_dropout=0.0,
+        lyric_enc_emb_dropout=0.0,
+        lyric_enc_zero_out=False,
+        lyric_enc_res_scale=False,
+        lyric_enc_pos_init=False,
+        lyric_enc_n_vocab=79,
         prior_init_scale=[0.2, 1, 1],
         prior_spread=None,
         prior_zero_out=False,
@@ -146,7 +145,6 @@ def __init__(
         **kwargs,
     ):
 
-        self.fp16_params = fp16_params
         self.init_std = init_std
         self.copy_input = copy_input
         self.nb_priors = nb_priors
@@ -198,23 +196,23 @@ def __init__(
         self.lyric_conditioning = lyric_conditioning
         self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
 
-        self.prime_attn_dropout = prime_attn_dropout
-        self.prime_attn_order = prime_attn_order
-        self.prime_blocks = prime_blocks
-        self.prime_depth = prime_depth
-        self.prime_emb_dropout = prime_emb_dropout
-        self.prime_heads = prime_heads
-        self.prime_init_scale = prime_init_scale
-        self.prime_loss_fraction = prime_loss_fraction
-        self.prime_m_attn = prime_m_attn
-        self.prime_m_mlp = prime_m_mlp
-        self.prime_pos_init = prime_pos_init
-        self.prime_resid_dropout = prime_resid_dropout
-        self.prime_res_scale = prime_res_scale
-        self.prime_spread = prime_spread
-        self.prime_width = prime_width
-        self.prime_zero_out = prime_zero_out
-        self.prime_n_vocab = prime_n_vocab
+        self.lyric_enc_attn_dropout = lyric_enc_attn_dropout
+        self.lyric_enc_attn_order = lyric_enc_attn_order
+        self.lyric_enc_blocks = lyric_enc_blocks
+        self.lyric_enc_depth = lyric_enc_depth
+        self.lyric_enc_emb_dropout = lyric_enc_emb_dropout
+        self.lyric_enc_heads = lyric_enc_heads
+        self.lyric_enc_init_scale = lyric_enc_init_scale
+        self.lyric_enc_loss_fraction = lyric_enc_loss_fraction
+        self.lyric_enc_m_attn = lyric_enc_m_attn
+        self.lyric_enc_m_mlp = lyric_enc_m_mlp
+        self.lyric_enc_pos_init = lyric_enc_pos_init
+        self.lyric_enc_resid_dropout = lyric_enc_resid_dropout
+        self.lyric_enc_res_scale = lyric_enc_res_scale
+        self.lyric_enc_spread = lyric_enc_spread
+        self.lyric_enc_width = lyric_enc_width
+        self.lyric_enc_zero_out = lyric_enc_zero_out
+        self.lyric_enc_n_vocab = lyric_enc_n_vocab
 
         # VQVAE parameters (all used)
         self.vqvae_levels = vqvae_levels
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 595f2db35e687..971de8cbd4ee6 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -773,7 +773,7 @@ def __init__(
         blocks=None,
         spread=None,
         encoder_dims=None,
-        prime_len=None,
+        lyric_enc_len=None,
     ):
         super().__init__()
         self.width = width  # should have a better name
@@ -814,7 +814,7 @@ def __init__(
         self.sample_t = 0
         self.cache = {}
         self.encoder_dims = encoder_dims
-        self.prime_len = prime_len
+        self.lyric_enc_len = lyric_enc_len
         self.record_attn = False
         self.w = None
 
@@ -847,7 +847,7 @@ def _attn(self, query_states, key_states, value_states, sample):
             self.attention_prob = attention_prob
             if self.attn_func == 7:
                 # only keep music queries and lyrics keys/values
-                self.attention_prob = self.attention_prob[:, :, self.prime_len :, : self.prime_len]
+                self.attention_prob = self.attention_prob[:, :, self.lyric_enc_len :, : self.lyric_enc_len]
         attention_prob = self.attn_dropout(attention_prob)
         context_states = torch.matmul(attention_prob, value_states)
         return context_states
@@ -1031,9 +1031,9 @@ def summary_spread_attn(self, query, key, value, sample):
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def prime_attn(self, query, key, value, sample):
-        prime_len = self._prime_len
-        key = key[:, :prime_len]
-        value = value[:, :prime_len]
+        lyric_enc_len = self._lyric_enc_len
+        key = key[:, :lyric_enc_len]
+        value = value[:, :lyric_enc_len]
         return self.dense_attn(query, key, value, sample)
 
     def decode_attn(self, query, key, value, sample):
@@ -1068,10 +1068,10 @@ def prime_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         assert lyric_encoder_states is None
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
-            if self._cache_len() < self._prime_len:
+            if self._cache_len() < self._lyric_enc_len:
                 self._append_cache(key, value)
-            if self._cache_len() > self._prime_len:
-                self._slice_cache(0, self._prime_len)
+            if self._cache_len() > self._lyric_enc_len:
+                self._slice_cache(0, self._lyric_enc_len)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         return query, key, value, sample
@@ -1102,10 +1102,10 @@ def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
         return self.resid_dropout(a)
 
     @property
-    def _prime_len(self):
-        prime_len = self.prime_len
-        prime_blocks = (prime_len // self.blocks) + 1
-        return prime_blocks * self.blocks
+    def _lyric_enc_len(self):
+        lyric_enc_len = self.lyric_enc_len
+        lyric_enc_blocks = (lyric_enc_len // self.blocks) + 1
+        return lyric_enc_blocks * self.blocks
 
     def _offset(self, curr_ctx):
         if self.attn_func == 0:
@@ -1147,7 +1147,7 @@ def _suff_cache_len(self):
         elif self.attn_func == 6:
             return self.encoder_dims
         elif self.attn_func == 7:
-            return min(self.sample_t, self._prime_len)
+            return min(self.sample_t, self._lyric_enc_len)
         else:
             raise NotImplementedError()
 
@@ -1212,7 +1212,7 @@ def __init__(
         blocks=None,
         spread=None,
         encoder_dims=None,
-        prime_len=None,
+        lyric_enc_len=None,
     ):
         super().__init__()
         self.attn = JukeboxAttention(
@@ -1230,7 +1230,7 @@ def __init__(
             blocks=blocks,
             spread=spread,
             encoder_dims=encoder_dims,
-            prime_len=prime_len,
+            lyric_enc_len=lyric_enc_len,
         )
 
         self.layer_norm_0 = JukeboxLayerNorm(width)
@@ -1283,7 +1283,7 @@ def __init__(
         blocks=None,
         spread=None,
         encoder_dims=None,
-        prime_len=None,
+        lyric_enc_len=None,
     ):
         super().__init__()
         self.width = width
@@ -1292,7 +1292,7 @@ def __init__(
         self.blocks = blocks
         if blocks is not None:
             self.block_ctx = n_ctx // blocks
-        self.prime_len = prime_len
+        self.lyric_enc_len = lyric_enc_len
         self.num_heads = num_heads
 
         res_scale = 1.0 / n_depth if res_scale else 1.0
@@ -1337,7 +1337,7 @@ def attn_block(d):
                 blocks=blocks,
                 spread=spread,
                 encoder_dims=encoder_dims,
-                prime_len=prime_len,
+                lyric_enc_len=lyric_enc_len,
             )
 
         self._attn_mods = nn.ModuleList()
@@ -1437,7 +1437,7 @@ def __init__(
         encoder_dims=0,
         only_encode=False,
         merged_decoder=False,
-        prime_len=None,
+        lyric_enc_len=None,
         afn="quick_gelu",
     ):
         """
@@ -1450,7 +1450,7 @@ def __init__(
         - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
           timing. When
         False, the start token is random.
-        - prime_len : for now len of the lyric hidden states
+        - lyric_enc_len : for now len of the lyric hidden states
         """
         super().__init__()
         self.input_shape = input_shape
@@ -1492,12 +1492,10 @@ def __init__(
             blocks=blocks,
             spread=spread,
             encoder_dims=encoder_dims,
-            prime_len=prime_len,
+            lyric_enc_len=lyric_enc_len,
         )
-        # TODO rename prime_len
         self.only_encode = only_encode
-        self.prime_len = prime_len
-        # TODO rename fc_pro_out to LM head an probably use HF's linking trick
+        self.lyric_enc_len = lyric_enc_len
         if merged_decoder:
             # Merged piped model uses this setup
             self.add_cond_after_transformer = False
@@ -1576,15 +1574,13 @@ def forward(
         hidden_states = self.fc_proj_out(hidden_states)  # Predictions
 
         if get_sep_loss:
-            # TODO rename x_prime and x_gen. Prime is related to primed sampling
-            # TODO rename prime_length, prime_loss  (related au primed_sample)
-            x_prime = hidden_states[:, : self.prime_len].reshape(-1, self.embed_dim)
-            x_gen = hidden_states[:, self.prime_len :].reshape(-1, self.embed_dim)
+            lyric_hidden_states = hidden_states[:, : self.lyric_enc_len].reshape(-1, self.embed_dim)
+            token_hidden_states = hidden_states[:, self.lyric_enc_len :].reshape(-1, self.embed_dim)
 
-            prime_loss = F.cross_entropy(x_prime, target[:, : self.prime_len].reshape(-1)) / np.log(2.0)
-            gen_loss = F.cross_entropy(x_gen, target[:, self.prime_len :].reshape(-1)) / np.log(2.0)
+            lyric_loss = F.cross_entropy(lyric_hidden_states, target[:, : self.lyric_enc_len].reshape(-1)) / np.log(2.0)
+            music_token_loss = F.cross_entropy(token_hidden_states, target[:, self.lyric_enc_len :].reshape(-1)) / np.log(2.0)
 
-            loss = (prime_loss, gen_loss)  # Note order! Prime is first
+            loss = (lyric_loss, music_token_loss)  # Note order! Lyric is first
         else:
             loss = F.cross_entropy(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0)  # Loss
 
@@ -1678,6 +1674,7 @@ def split_chunks(length, chunk_size):
         chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
         return chunk_sizes
 
+    # FIXME TODO last function needing renaming
     def primed_sample(
         self,
         n_samples,
@@ -1715,7 +1712,6 @@ def primed_sample(
             # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
             if chunk_size is None:
                 chunk_size = len(sampled_audio)
-            # assert len(sampled_audio) % chunk_size == 0, f'expected {len(sampled_audio)} to be divisible by {chunk_size}'
             chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
             x_primes = []
             start = 0
@@ -1872,7 +1868,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         Args :
             - music_tokens : int or long, in range(codebook_dim)
-            - raw_audio_conditionning : used when prime sampling, raw audio information that conditions
+            - raw_audio_conditionning : used when primed sampling, raw audio information that conditions
             the generation
         """
         if raw_audio_conditionning is None:
@@ -1969,11 +1965,10 @@ def __init__(
         super().__init__()
         self.n_time = n_time
         self.out_width = out_width
-        # TODO rename bins
-        bow_genre_bins, artist_bins = metadata_dims
+        nb_genres, nb_artists = metadata_dims
         self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = JukeboxSimpleEmbedding(bow_genre_bins, out_width, init_scale)
-        self.artist_emb = JukeboxSimpleEmbedding(artist_bins, out_width, init_scale)
+        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width, init_scale)
+        self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width, init_scale)
         self.include_time_signal = include_time_signal
         if self.include_time_signal:
             t_ranges = (
@@ -2032,8 +2027,7 @@ class JukeboxPrior(nn.Module):
     - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
       vocab
     and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
-    which we autoregressively model together. # TODO this explains the input bins that are different from the lower lvl
-    transformers.
+    which we autoregressively model together. 
 
     Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in
     the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
@@ -2056,9 +2050,7 @@ def rescale(music_tokens_shape):
         music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
         self.lyric_conditioning = config.lyric_conditioning[-level - 1]
         self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens[-level - 1]
-
-        # TODO rename prime loss fraction
-        self.prime_loss_fraction = config.prime_loss_fraction[-level - 1]
+        self.lyric_enc_loss_fraction = config.lyric_enc_loss_fraction[-level - 1]
 
         self.copy_input = config.copy_input
         if self.copy_input:
@@ -2093,28 +2085,27 @@ def rescale(music_tokens_shape):
         )
 
         if config.lyric_conditioning and not config.single_enc_dec[-level - 1]:
-            # TODO rename to encoder_kwargs as they are used both
-            # when single and not
+            # lyric_enc -> lyric_enc
             lyric_enc_kwargs = dict(
-                embed_dim=config.prime_n_vocab,  # previously bins
-                width=config.prime_width[-level - 1],
-                depth=config.prime_depth[-level - 1],
-                heads=config.prime_heads,
-                attn_order=config.prime_attn_order[-level - 1],
-                blocks=config.prime_blocks,
-                spread=config.prime_spread,
-                attn_dropout=config.prime_attn_dropout,
-                resid_dropout=config.prime_resid_dropout,
-                emb_dropout=config.prime_emb_dropout,
-                zero_out=config.prime_zero_out,
-                res_scale=config.prime_res_scale,
-                pos_init=config.prime_pos_init,
-                init_scale=config.prime_init_scale[-level - 1],
-                m_attn=config.prime_m_attn,
-                m_mlp=config.prime_m_mlp,
+                embed_dim=config.lyric_enc_n_vocab,  # previously bins
+                width=config.lyric_enc_width[-level - 1],
+                depth=config.lyric_enc_depth[-level - 1],
+                heads=config.lyric_enc_heads,
+                attn_order=config.lyric_enc_attn_order[-level - 1],
+                blocks=config.lyric_enc_blocks,
+                spread=config.lyric_enc_spread,
+                attn_dropout=config.lyric_enc_attn_dropout,
+                resid_dropout=config.lyric_enc_resid_dropout,
+                emb_dropout=config.lyric_enc_emb_dropout,
+                zero_out=config.lyric_enc_zero_out,
+                res_scale=config.lyric_enc_res_scale,
+                pos_init=config.lyric_enc_pos_init,
+                init_scale=config.lyric_enc_init_scale[-level - 1],
+                m_attn=config.lyric_enc_m_attn,
+                m_mlp=config.lyric_enc_m_mlp,
             )
         else:
-            lyric_enc_kwargs = dict(embed_dim=config.prime_n_vocab)
+            lyric_enc_kwargs = dict(embed_dim=config.lyric_enc_n_vocab)
 
         audio_conditioning_kwargs = dict(
             out_width=config.prior_width[-level - 1],
@@ -2177,7 +2168,7 @@ def conditioner_block(_level):
             self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
             self.prior_width = prior_kwargs["width"]
 
-            # lyrics_enc_loss_dims was the prime loss dims, gen is for the generated tokens.
+            # lyrics_enc_loss_dims was the lyric_enc loss dims, gen is for the generated tokens.
             # what is the shape of the lyrics loss?
 
             self.lyrics_enc_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
@@ -2187,7 +2178,7 @@ def conditioner_block(_level):
                 embed_dim=sum(self.prior_embed_dim),
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
-                prime_len=self.lyrics_enc_loss_dims,
+                lyric_enc_len=self.lyrics_enc_loss_dims,
                 **prior_kwargs,
             )
 
@@ -2517,12 +2508,12 @@ def forward_tokens(
             tokens, audio_conditioning = self.prior_preprocess(
                 [lyric_tokens, music_tokens], [None, audio_conditioning]
             )
-            (prime_loss, gen_loss), preds = self.prior(
+            (lyric_enc_loss, gen_loss), preds = self.prior(
                 tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
             )
         else:
             lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16)
-            prime_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
+            lyric_enc_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
             gen_loss, preds = self.prior(
                 music_tokens,
                 audio_conditioning,
@@ -2531,11 +2522,11 @@ def forward_tokens(
                 fp16=fp16,
                 get_preds=get_preds,
             )
-        loss = (self.prime_loss_fraction * prime_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
+        loss = (self.lyric_enc_loss_fraction * lyric_enc_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
             gen_loss * self.gen_loss_dims / self.total_loss_dims
         )
         metrics = dict(
-            bpd=gen_loss.clone().detach(), prime_loss=prime_loss.clone().detach(), gen_loss=gen_loss.clone().detach()
+            bpd=gen_loss.clone().detach(), lyric_enc_loss=lyric_enc_loss.clone().detach(), gen_loss=gen_loss.clone().detach()
         )
         if get_preds:
             metrics["preds"] = preds.clone().detach()
@@ -2988,7 +2979,7 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
 
 
 # TODO add tied embeddings for the lyric encoder lm head as well as the proj_out when they are not seperated.
-# TODO should support cehckpointing attention as it is faster
+
 
 # Training the prior on next token prediction using a bert tokenizer would make more sens than only predicting the letter
 # Indeed the model in unconditional sampling does not generate proper lyrics. Thus should have been

From 30a7b0dd1ab9d45225451a31f14f696ca9aa66a6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 12:48:32 +0000
Subject: [PATCH 100/196] style

---
 .../models/jukebox/modeling_jukebox.py             | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 971de8cbd4ee6..9a08e8f2f0d60 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1577,8 +1577,12 @@ def forward(
             lyric_hidden_states = hidden_states[:, : self.lyric_enc_len].reshape(-1, self.embed_dim)
             token_hidden_states = hidden_states[:, self.lyric_enc_len :].reshape(-1, self.embed_dim)
 
-            lyric_loss = F.cross_entropy(lyric_hidden_states, target[:, : self.lyric_enc_len].reshape(-1)) / np.log(2.0)
-            music_token_loss = F.cross_entropy(token_hidden_states, target[:, self.lyric_enc_len :].reshape(-1)) / np.log(2.0)
+            lyric_loss = F.cross_entropy(lyric_hidden_states, target[:, : self.lyric_enc_len].reshape(-1)) / np.log(
+                2.0
+            )
+            music_token_loss = F.cross_entropy(
+                token_hidden_states, target[:, self.lyric_enc_len :].reshape(-1)
+            ) / np.log(2.0)
 
             loss = (lyric_loss, music_token_loss)  # Note order! Lyric is first
         else:
@@ -2027,7 +2031,7 @@ class JukeboxPrior(nn.Module):
     - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
       vocab
     and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
-    which we autoregressively model together. 
+    which we autoregressively model together.
 
     Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in
     the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
@@ -2526,7 +2530,9 @@ def forward_tokens(
             gen_loss * self.gen_loss_dims / self.total_loss_dims
         )
         metrics = dict(
-            bpd=gen_loss.clone().detach(), lyric_enc_loss=lyric_enc_loss.clone().detach(), gen_loss=gen_loss.clone().detach()
+            bpd=gen_loss.clone().detach(),
+            lyric_enc_loss=lyric_enc_loss.clone().detach(),
+            gen_loss=gen_loss.clone().detach(),
         )
         if get_preds:
             metrics["preds"] = preds.clone().detach()

From e1f7376dafdcec5f69433afe0eb5a34df9a77a8a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 13:33:29 +0000
Subject: [PATCH 101/196] Start docstring

---
 .../models/jukebox/configuration_jukebox.py   | 157 +++++++++++++++++-
 .../models/jukebox/modeling_jukebox.py        |   6 -
 2 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 6533dc08ba575..391f986b24411 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -41,6 +41,160 @@ class JukeboxConfig(PretrainedConfig):
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            Sampling rate of the raw audio.
+        metadata_dims (`list`, *optional*, defaults to [(604, 7898), (120, 4111), (120, 4111)]):
+            List containing the number of genres and the number of artists that were used to train the 
+            embedding layers of each of the prior models. 
+        nb_priors (`int`, *optional*, defaults to 3):
+            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive (decoder) model, 
+            apart from the top prior, which can include a lyric encoder. The available models were trained using a top prior and 
+            2 upsampler priors.
+        timing_dims (`int`, *optional*, defaults to 64):
+            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding layer. 
+            #TODO the timing embedding layer converts the absolute and relative position in the currently sampled audio
+            to a tensor of lenght `timing_dims` that will be added to the music tokens. 
+        single_enc_dec (`list`, *optional*, defaults to [True, False, False]):
+            Whether or not to use a single encoder-decoder architecture or split both modules and have 
+            a seperate `lyric_encoder` for each of the priors.  
+        metadata_conditioning (`bool`, *optional*, defaults to True):
+            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum duration. 
+        merged_decoder (`list`, *optional*, defaults to [True, False, False]):
+            # FIXME is that the same as single_enc_dec ?? 
+        lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
+            Whether or not to use the lyrics as conditioning. 
+        nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
+            Number of tokens that are used when sampling a single window of length `prior_n_ctx`
+        min_duration (`float`, *optional*, defaults to 17.84):
+            Minimum duration of the audios to generate
+        max_duration (`float`, *optional*, defaults to 600.0):
+            Maximum duration of the audios to generate
+        max_nb_genres (`int`, *optional*, defaults to 5):
+            Maximum number of genres that can be used to condition a single sample. 
+        init_std (`float`, *optional*, defaults to 0.2):
+            Standard deviation used to inital the model.
+        hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
+            # TODO detail this 
+        cond_zero_out (`bool`, *optional*, defaults to False):
+
+        cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
+
+        cond_width (`list`, *optional*, defaults to [128, 1024, 1024]):
+
+        cond_dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
+
+        cond_dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
+
+        cond_res_scale (`list`, *optional*, defaults to [None, True, False]):
+
+        cond_m_conv (`int`, *optional*, defaults to 1):
+
+        cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
+
+        cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
+
+        lyric_enc_spread (`bool`, *optional*, defaults to False):
+
+        lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
+
+        lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
+
+        lyric_enc_heads (`int`, *optional*, defaults to 4):
+
+        lyric_enc_m_attn (`float`, *optional*, defaults to 0.25):
+
+        lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
+
+        lyric_enc_blocks (`int`, *optional*, defaults to 32):
+
+        lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
+
+        lyric_enc_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
+
+        lyric_enc_attn_order (`list`, *optional*, defaults to [2, 0, 0]):
+
+        lyric_enc_attn_dropout (`float`, *optional*, defaults to 0.0):
+
+        lyric_enc_resid_dropout (`float`, *optional*, defaults to 0.0):
+
+        lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
+
+        lyric_enc_zero_out (`bool`, *optional*, defaults to False):
+
+        lyric_enc_res_scale (`bool`, *optional*, defaults to False):
+
+        lyric_enc_pos_init (`bool`, *optional*, defaults to False):
+
+        lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
+
+        prior_init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
+
+        prior_spread (`bool`, *optional*, defaults to False):
+
+        prior_zero_out (`bool`, *optional*, defaults to False):
+
+        prior_res_scale (`bool`, *optional*, defaults to False):
+
+        prior_pos_init (`bool`, *optional*, defaults to False):
+
+        prior_n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
+
+        prior_latent_dim (`int`, *optional*, defaults to 2048):
+
+        prior_width (`list`, *optional*, defaults to [2048, 1920, 1920]):
+
+        prior_depth (`list`, *optional*, defaults to [72, 72, 72]):
+
+        prior_n_heads (`list`, *optional*, defaults to [2, 1, 1]):
+
+        prior_attn_order (`list`, *optional*, defaults to [12, 2, 2]):
+
+        prior_blocks (`int`, *optional*, defaults to 64):
+
+        prior_alignment_layer (`list`, *optional*, defaults to [68, None, None]):
+
+        prior_alignment_head (`list`, *optional*, defaults to [2, None, None]):
+
+        prior_m_attn (`float`, *optional*, defaults to 0.25):
+
+        prior_attn_dropout (`int`, *optional*, defaults to 0):
+
+        prior_resid_dropout (`int`, *optional*, defaults to 0):
+
+        prior_emb_dropout (`int`, *optional*, defaults to 0):
+
+        vqvae_levels (`int`, *optional*, defaults to 3):
+
+        vqvae_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
+
+        vqvae_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
+
+        vqvae_emmbedding_width (`int`, *optional*, defaults to 64):
+
+        vqvae_codebook_dimension (`int`, *optional*, defaults to 2048):
+
+        vqvae_width (`int`, *optional*, defaults to 32):
+
+        vqvae_depth (`int`, *optional*, defaults to 4):
+
+        vqvae_m_conv (`int`, *optional*, defaults to 1):
+
+        vqvae_dilation_growth_rate (`int`, *optional*, defaults to 3):
+
+        vqvae_dilation_cycle (`bool`, *optional*, defaults to False):
+
+        vqvae_multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
+
+        vqvae_lmu (`float`, *optional*, defaults to 0.99):
+
+        vqvae_commit (`float`, *optional*, defaults to 0.02):
+
+        vqvae_conv_block_depth (`int`, *optional*, defaults to 4):
+
+        vqvae_conv_block_width (`int`, *optional*, defaults to 32):
+
+        vqvae_reverse_decoder_dilation (`int`, *optional*, defaults to 1):
+
     Example:
 
     ```python
@@ -69,7 +223,6 @@ def __init__(
         self,
         sampling_rate=44100,
         metadata_dims=[(604, 7898), (120, 4111), (120, 4111)],
-        copy_input=False,
         nb_priors=3,
         timing_dims=64,
         single_enc_dec=[True, False, False],
@@ -144,9 +297,7 @@ def __init__(
         vqvae_reverse_decoder_dilation=1,
         **kwargs,
     ):
-
         self.init_std = init_std
-        self.copy_input = copy_input
         self.nb_priors = nb_priors
         self.hop_fraction = hop_fraction
 
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9a08e8f2f0d60..2f03cb6c41fe5 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2056,10 +2056,6 @@ def rescale(music_tokens_shape):
         self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens[-level - 1]
         self.lyric_enc_loss_fraction = config.lyric_enc_loss_fraction[-level - 1]
 
-        self.copy_input = config.copy_input
-        if self.copy_input:
-            config.bins = config.prior_latent_dim
-
         self.music_tokens_shapes = music_tokens_shapes
         self.levels = len(self.music_tokens_shapes)
 
@@ -2505,8 +2501,6 @@ def forward_tokens(
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
         audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
-        if self.copy_input:
-            lyric_tokens = music_tokens[:, : self.nb_relevant_lyric_tokens]
 
         if self.single_enc_dec:  # the preprocess returns the full tokens, shifted
             tokens, audio_conditioning = self.prior_preprocess(

From 6c982c0de56af358291c5679ccd185cd74918d02 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 13:35:36 +0000
Subject: [PATCH 102/196] style

---
 .../models/jukebox/configuration_jukebox.py   | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 391f986b24411..2a444c4db5fa6 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -44,25 +44,26 @@ class JukeboxConfig(PretrainedConfig):
         sampling_rate (`int`, *optional*, defaults to 44100):
             Sampling rate of the raw audio.
         metadata_dims (`list`, *optional*, defaults to [(604, 7898), (120, 4111), (120, 4111)]):
-            List containing the number of genres and the number of artists that were used to train the 
-            embedding layers of each of the prior models. 
+            List containing the number of genres and the number of artists that were used to train the embedding layers
+            of each of the prior models.
         nb_priors (`int`, *optional*, defaults to 3):
-            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive (decoder) model, 
-            apart from the top prior, which can include a lyric encoder. The available models were trained using a top prior and 
-            2 upsampler priors.
+            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
+            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
+            trained using a top prior and 2 upsampler priors.
         timing_dims (`int`, *optional*, defaults to 64):
-            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding layer. 
-            #TODO the timing embedding layer converts the absolute and relative position in the currently sampled audio
-            to a tensor of lenght `timing_dims` that will be added to the music tokens. 
+            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
+            layer. #TODO the timing embedding layer converts the absolute and relative position in the currently
+            sampled audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
         single_enc_dec (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not to use a single encoder-decoder architecture or split both modules and have 
-            a seperate `lyric_encoder` for each of the priors.  
+            Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
+            `lyric_encoder` for each of the priors.
         metadata_conditioning (`bool`, *optional*, defaults to True):
-            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum duration. 
+            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
+            duration.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
-            # FIXME is that the same as single_enc_dec ?? 
+            # FIXME is that the same as single_enc_dec ??
         lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not to use the lyrics as conditioning. 
+            Whether or not to use the lyrics as conditioning.
         nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
             Number of tokens that are used when sampling a single window of length `prior_n_ctx`
         min_duration (`float`, *optional*, defaults to 17.84):
@@ -70,11 +71,11 @@ class JukeboxConfig(PretrainedConfig):
         max_duration (`float`, *optional*, defaults to 600.0):
             Maximum duration of the audios to generate
         max_nb_genres (`int`, *optional*, defaults to 5):
-            Maximum number of genres that can be used to condition a single sample. 
+            Maximum number of genres that can be used to condition a single sample.
         init_std (`float`, *optional*, defaults to 0.2):
             Standard deviation used to inital the model.
         hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
-            # TODO detail this 
+            # TODO detail this
         cond_zero_out (`bool`, *optional*, defaults to False):
 
         cond_depth (`list`, *optional*, defaults to [3, 16, 16]):

From f87de9b8e3d881b7ae7402eabc22ee5440c7d850 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 14:00:02 +0000
Subject: [PATCH 103/196] update doc

---
 .../models/jukebox/configuration_jukebox.py   | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 2a444c4db5fa6..558888d008cdc 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -77,35 +77,37 @@ class JukeboxConfig(PretrainedConfig):
         hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
             # TODO detail this
         cond_zero_out (`bool`, *optional*, defaults to False):
-
+            Zero out weights when initialising. 
         cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
-
+            Number of layers to use for the music conditioner. 
         cond_width (`list`, *optional*, defaults to [128, 1024, 1024]):
-
+            Width of the audio conditioning layer. 
         cond_dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
-
+            Dilation grow rate used between each convolutionnal block. 
         cond_dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
-
+            Cycle of dilation to use. Usually similar to the ones used in the VQVAE.
         cond_res_scale (`list`, *optional*, defaults to [None, True, False]):
-
+            Wheter or not to scale the residuals in the audio conditionner block. 
+            Since the top level prior doeas not have a conditionner, the default value is to None
+            and should not be modified. 
         cond_m_conv (`int`, *optional*, defaults to 1):
-
+            # TODO no idea what that really corresponds to? 
         cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-
+            Downsampling ... # TODO 
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-
+            Striding pattern to use #TODO 
         lyric_enc_spread (`bool`, *optional*, defaults to False):
-
+            Spread used in the attention pattern #TODO check what that is actually 
         lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
-
+            Width of the lyric encoder
         lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
-
+            Number of blocks used in the lyric encoder is this different from lyric_enc_blocks? FIXME
         lyric_enc_heads (`int`, *optional*, defaults to 4):
-
+            Number of heads in the lyric encoder
         lyric_enc_m_attn (`float`, *optional*, defaults to 0.25):
-
+            # again, m_attn and m_mlp, I don't really know how to rename it
         lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
-
+            # again, m_attn and m_mlp, I don't really know how to rename it
         lyric_enc_blocks (`int`, *optional*, defaults to 32):
 
         lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):

From e70b4c815454e81c8b47b84bd2326ee2da68cbef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 18 Aug 2022 15:22:35 +0000
Subject: [PATCH 104/196] update

---
 .../models/jukebox/configuration_jukebox.py   | 50 ++++++++++---------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 558888d008cdc..f43dc278e365f 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -109,41 +109,42 @@ class JukeboxConfig(PretrainedConfig):
         lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
             # again, m_attn and m_mlp, I don't really know how to rename it
         lyric_enc_blocks (`int`, *optional*, defaults to 32):
-
+            
         lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
-
+            
         lyric_enc_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
-
+            
         lyric_enc_attn_order (`list`, *optional*, defaults to [2, 0, 0]):
-
+            Which attention pattern to use for the lyric encoder
         lyric_enc_attn_dropout (`float`, *optional*, defaults to 0.0):
-
+            
         lyric_enc_resid_dropout (`float`, *optional*, defaults to 0.0):
-
+            
         lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
-
+            
         lyric_enc_zero_out (`bool`, *optional*, defaults to False):
-
+            
         lyric_enc_res_scale (`bool`, *optional*, defaults to False):
-
+            
         lyric_enc_pos_init (`bool`, *optional*, defaults to False):
-
+            
         lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
-
+            
         prior_init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
-
+            
         prior_spread (`bool`, *optional*, defaults to False):
-
+            
         prior_zero_out (`bool`, *optional*, defaults to False):
-
+            
         prior_res_scale (`bool`, *optional*, defaults to False):
-
+            
         prior_pos_init (`bool`, *optional*, defaults to False):
-
+            
         prior_n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
-
+            Number of context tokens for each prior. The context tokens are the music tokens that are 
+            attended to when generating music tokens. 
         prior_latent_dim (`int`, *optional*, defaults to 2048):
-
+            Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`. 
         prior_width (`list`, *optional*, defaults to [2048, 1920, 1920]):
 
         prior_depth (`list`, *optional*, defaults to [72, 72, 72]):
@@ -151,13 +152,14 @@ class JukeboxConfig(PretrainedConfig):
         prior_n_heads (`list`, *optional*, defaults to [2, 1, 1]):
 
         prior_attn_order (`list`, *optional*, defaults to [12, 2, 2]):
-
+            Attention patterns to use in each prior. Depending on the value, cross attention, block attention and sparse 
+            attention blocks are stacked. 
         prior_blocks (`int`, *optional*, defaults to 64):
 
         prior_alignment_layer (`list`, *optional*, defaults to [68, None, None]):
-
+            Layer corresponding to the alignemnt between the lyrics and the audio. 
         prior_alignment_head (`list`, *optional*, defaults to [2, None, None]):
-
+            Index of the attention head which takes care of the alignemnt between the lyrics and the audio. 
         prior_m_attn (`float`, *optional*, defaults to 0.25):
 
         prior_attn_dropout (`int`, *optional*, defaults to 0):
@@ -167,15 +169,15 @@ class JukeboxConfig(PretrainedConfig):
         prior_emb_dropout (`int`, *optional*, defaults to 0):
 
         vqvae_levels (`int`, *optional*, defaults to 3):
-
+            Number of hierachical levels that used in the VQVAE. 
         vqvae_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
 
         vqvae_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
 
         vqvae_emmbedding_width (`int`, *optional*, defaults to 64):
-
+            Dimension of the codebook vectors.
         vqvae_codebook_dimension (`int`, *optional*, defaults to 2048):
-
+            Number of codes to use in each of the VQVAE. 
         vqvae_width (`int`, *optional*, defaults to 32):
 
         vqvae_depth (`int`, *optional*, defaults to 4):

From a6462e28f6b10a679716a50137312bd9b58a2d5f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 19 Aug 2022 11:52:48 +0000
Subject: [PATCH 105/196] fix test update doc

---
 docs/source/en/model_doc/jukebox.mdx          | 30 ++++++++++---------
 .../models/jukebox/configuration_jukebox.py   |  2 +-
 .../models/jukebox/modeling_jukebox.py        |  4 +--
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 614e0ff2dd23f..b6039ac1c6dbb 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -22,21 +22,17 @@ artist, genre and lyrics.
 
 The abstract from the paper is the following:
 
-We introduce Jukebox, a model that generates
-music with singing in the raw audio domain. We
-tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes,
-and modeling those using autoregressive Transformers. We show that the combined model at
-scale can generate high-fidelity and diverse songs
-with coherence up to multiple minutes. We can
-condition on artist and genre to steer the musical
-and vocal style, and on unaligned lyrics to make
-the singing more controllable. We are releasing
-thousands of non cherry-picked samples, along
-with model weights and code.
+We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.
 
-Tips:
+As shown on the following figure, Jukebox is made of 3 `priors` which are decoders only. They follow a particular architecture described in `Scalable Transformers` #TODO add link to the paper. 
+An encoder model is used on the lyrics, on the first (also called `top_prior`) prior's decoder attends to the last lyric hidden states. Each prior is linked to the previous by an `AudioConditionner` module which takes care of upsampling the generated hidden state to the correct `raw_to_token` resolution.
+The metadatas such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
+
+![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/2a75e9ab330ef27ae6565fcf28b129e033ff9bed/Jukebox.svg)
 
-This model is very slow for now, and takes 18h to generate a minute long audio. 
+Tips:
+- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior.
+- Primed sampling requires more memory than ancestral sampling and should be used with `fp16` set to `True`. 
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
 The original code can be found [here](https://github.com/openai/jukebox).
@@ -53,9 +49,15 @@ The original code can be found [here](https://github.com/openai/jukebox).
 ## JukeboxModel
 
 [[autodoc]] JukeboxModel 
-    - forward 
+    - primed_sample
+    - ancestral_sample 
+    - continue_sample
+    - upsample
+    - _sample
 
 ## JukeboxVQVAE 
 
 [[autodoc]] JukeboxVQVAE 
     - forward 
+    - encode 
+    - decode
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index f43dc278e365f..f315430cc3e06 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -91,7 +91,7 @@ class JukeboxConfig(PretrainedConfig):
             Since the top level prior doeas not have a conditionner, the default value is to None
             and should not be modified. 
         cond_m_conv (`int`, *optional*, defaults to 1):
-            # TODO no idea what that really corresponds to? 
+            Conditionner multiplier (the input states are mulitplied by that parameter for each convolution. 
         cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
             Downsampling ... # TODO 
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2f03cb6c41fe5..52e51c7a05fac 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1673,7 +1673,7 @@ def sample(
         else:
             return tokens
 
-    def split_chunks(length, chunk_size):
+    def split_chunks(self,length, chunk_size):
         n_passes = (length + chunk_size - 1) // chunk_size
         chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
         return chunk_sizes
@@ -2738,7 +2738,7 @@ def __init__(self, config):
         config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
-    def split_batch(obj, n_samples, split_size):
+    def split_batch(self, obj, n_samples, split_size):
         n_passes = (n_samples + split_size - 1) // split_size
         if isinstance(obj, torch.Tensor):
             return torch.split(obj, split_size, dim=0)

From ad681ca05f18d0656b8ac8774ebd24e0a8240354 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 19 Aug 2022 11:56:45 +0000
Subject: [PATCH 106/196] style

---
 .../models/jukebox/configuration_jukebox.py   | 69 +++++++++----------
 .../models/jukebox/modeling_jukebox.py        |  2 +-
 2 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index f315430cc3e06..4af596725be97 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -77,27 +77,26 @@ class JukeboxConfig(PretrainedConfig):
         hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
             # TODO detail this
         cond_zero_out (`bool`, *optional*, defaults to False):
-            Zero out weights when initialising. 
+            Zero out weights when initialising.
         cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
-            Number of layers to use for the music conditioner. 
+            Number of layers to use for the music conditioner.
         cond_width (`list`, *optional*, defaults to [128, 1024, 1024]):
-            Width of the audio conditioning layer. 
+            Width of the audio conditioning layer.
         cond_dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
-            Dilation grow rate used between each convolutionnal block. 
+            Dilation grow rate used between each convolutionnal block.
         cond_dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
             Cycle of dilation to use. Usually similar to the ones used in the VQVAE.
         cond_res_scale (`list`, *optional*, defaults to [None, True, False]):
-            Wheter or not to scale the residuals in the audio conditionner block. 
-            Since the top level prior doeas not have a conditionner, the default value is to None
-            and should not be modified. 
+            Wheter or not to scale the residuals in the audio conditionner block. Since the top level prior doeas not
+            have a conditionner, the default value is to None and should not be modified.
         cond_m_conv (`int`, *optional*, defaults to 1):
-            Conditionner multiplier (the input states are mulitplied by that parameter for each convolution. 
+            Conditionner multiplier (the input states are mulitplied by that parameter for each convolution.
         cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-            Downsampling ... # TODO 
+            Downsampling ... # TODO
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-            Striding pattern to use #TODO 
+            Striding pattern to use #TODO
         lyric_enc_spread (`bool`, *optional*, defaults to False):
-            Spread used in the attention pattern #TODO check what that is actually 
+            Spread used in the attention pattern #TODO check what that is actually
         lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
             Width of the lyric encoder
         lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
@@ -109,42 +108,42 @@ class JukeboxConfig(PretrainedConfig):
         lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
             # again, m_attn and m_mlp, I don't really know how to rename it
         lyric_enc_blocks (`int`, *optional*, defaults to 32):
-            
+
         lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
-            
+
         lyric_enc_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
-            
+
         lyric_enc_attn_order (`list`, *optional*, defaults to [2, 0, 0]):
             Which attention pattern to use for the lyric encoder
         lyric_enc_attn_dropout (`float`, *optional*, defaults to 0.0):
-            
+
         lyric_enc_resid_dropout (`float`, *optional*, defaults to 0.0):
-            
+
         lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
-            
+
         lyric_enc_zero_out (`bool`, *optional*, defaults to False):
-            
+
         lyric_enc_res_scale (`bool`, *optional*, defaults to False):
-            
+
         lyric_enc_pos_init (`bool`, *optional*, defaults to False):
-            
+
         lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
-            
+
         prior_init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
-            
+
         prior_spread (`bool`, *optional*, defaults to False):
-            
+
         prior_zero_out (`bool`, *optional*, defaults to False):
-            
+
         prior_res_scale (`bool`, *optional*, defaults to False):
-            
+
         prior_pos_init (`bool`, *optional*, defaults to False):
-            
+
         prior_n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
-            Number of context tokens for each prior. The context tokens are the music tokens that are 
-            attended to when generating music tokens. 
+            Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
+            generating music tokens.
         prior_latent_dim (`int`, *optional*, defaults to 2048):
-            Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`. 
+            Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`.
         prior_width (`list`, *optional*, defaults to [2048, 1920, 1920]):
 
         prior_depth (`list`, *optional*, defaults to [72, 72, 72]):
@@ -152,14 +151,14 @@ class JukeboxConfig(PretrainedConfig):
         prior_n_heads (`list`, *optional*, defaults to [2, 1, 1]):
 
         prior_attn_order (`list`, *optional*, defaults to [12, 2, 2]):
-            Attention patterns to use in each prior. Depending on the value, cross attention, block attention and sparse 
-            attention blocks are stacked. 
+            Attention patterns to use in each prior. Depending on the value, cross attention, block attention and
+            sparse attention blocks are stacked.
         prior_blocks (`int`, *optional*, defaults to 64):
 
         prior_alignment_layer (`list`, *optional*, defaults to [68, None, None]):
-            Layer corresponding to the alignemnt between the lyrics and the audio. 
+            Layer corresponding to the alignemnt between the lyrics and the audio.
         prior_alignment_head (`list`, *optional*, defaults to [2, None, None]):
-            Index of the attention head which takes care of the alignemnt between the lyrics and the audio. 
+            Index of the attention head which takes care of the alignemnt between the lyrics and the audio.
         prior_m_attn (`float`, *optional*, defaults to 0.25):
 
         prior_attn_dropout (`int`, *optional*, defaults to 0):
@@ -169,7 +168,7 @@ class JukeboxConfig(PretrainedConfig):
         prior_emb_dropout (`int`, *optional*, defaults to 0):
 
         vqvae_levels (`int`, *optional*, defaults to 3):
-            Number of hierachical levels that used in the VQVAE. 
+            Number of hierachical levels that used in the VQVAE.
         vqvae_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
 
         vqvae_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
@@ -177,7 +176,7 @@ class JukeboxConfig(PretrainedConfig):
         vqvae_emmbedding_width (`int`, *optional*, defaults to 64):
             Dimension of the codebook vectors.
         vqvae_codebook_dimension (`int`, *optional*, defaults to 2048):
-            Number of codes to use in each of the VQVAE. 
+            Number of codes to use in each of the VQVAE.
         vqvae_width (`int`, *optional*, defaults to 32):
 
         vqvae_depth (`int`, *optional*, defaults to 4):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 52e51c7a05fac..b5473af27068f 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1673,7 +1673,7 @@ def sample(
         else:
             return tokens
 
-    def split_chunks(self,length, chunk_size):
+    def split_chunks(self, length, chunk_size):
         n_passes = (length + chunk_size - 1) // chunk_size
         chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
         return chunk_sizes

From f3590dab1140de6ea58654c5afafb81029b3569e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 20 Aug 2022 08:28:03 +0000
Subject: [PATCH 107/196] update

---
 .../models/jukebox/configuration_jukebox.py   |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 61 ++++++++-----------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 4af596725be97..f96c2b9159e82 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -75,7 +75,7 @@ class JukeboxConfig(PretrainedConfig):
         init_std (`float`, *optional*, defaults to 0.2):
             Standard deviation used to inital the model.
         hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
-            # TODO detail this
+            # TODO detail this amount of space between each of the sampling windows oif `n_ctx` tokens
         cond_zero_out (`bool`, *optional*, defaults to False):
             Zero out weights when initialising.
         cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index b5473af27068f..b46a416f6f630 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1381,7 +1381,7 @@ def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=F
             else:
                 hidden_states = attn_layer(hidden_states, lyric_encoder_states=None, sample=sample)
             if attn_layer.attn.record_attn:
-                self.saved_attn_weights.append(attn_layer.attn.w)
+                self.saved_attn_weights.append(attn_layer.attn.c_attn.weight)
         if not fp16_out:
             hidden_states = hidden_states.float()
         return hidden_states
@@ -1563,7 +1563,7 @@ def forward(
         )  # Pos emb and dropout
 
         hidden_states = self.transformer(
-            hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16
+            hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16, fp16_out=fp16
         )  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
             hidden_states = hidden_states + audio_conditioning
@@ -1647,7 +1647,7 @@ def sample(
                 )
                 self.transformer.check_cache(n_samples, sample_t, fp16)
                 hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
                 )
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -2361,7 +2361,7 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
 
     def get_cond(self, music_tokens_conds, metadata):
         """
-        Converts the tokens to the input_embeddings. Splits the lyrics and the metadata. Lyric tokens can be None
+        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens can be None.
         """
         if metadata is not None:
             n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens
@@ -2490,7 +2490,7 @@ def forward_tokens(
         self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False
     ):
         """
-        Applies a forward pass using the conditioning tokens. Different from the classif forward as it does not use the
+        Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
         vqvae's encoding layers.
 
         Args:
@@ -2571,8 +2571,6 @@ def _init_weights(self, module):
                 module.bias.data.zero_()
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
@@ -2612,6 +2610,8 @@ def get_starts(total_length, n_ctx, hop_length):
 def get_alignment(music_tokens, labels, prior, fp16, config):
     """
     Compute the lyric to music token alignment, but for now it cannot be used.
+
+    IN THE Oiginal code,
     """
     level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
@@ -2632,20 +2632,20 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
     alignment_hops = {}
     indices_hops = {}
     # prior.to(tokens.device)
-    prior.to("cpu")
+    prior.to("cuda")
     gc.collect()
     torch.cuda.empty_cache()
     for start in get_range(get_starts(total_length, n_ctx, hop_length)):
         end = start + n_ctx
         # set metadata offset, sample_length and lyrics tokens
         metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
-        metadata.to("cpu")
+        metadata.to("cuda")
         tokens_bs = torch.chunk(tokens, batch_size, dim=0)
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
-            tokens_i = tokens_i.to("cpu")
-            metadata_i = metadata_i.to("cpu")
+            tokens_i = tokens_i.to("cuda")
+            metadata_i = metadata_i.to("cuda")
             w_hop = prior.forward_tokens(
                 tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
             )
@@ -2669,13 +2669,12 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
     alignments = []
     for item in range(batch_size):
         # Note each item has different length lyrics
-        full_tokens = labels[:, 3:]
+        full_tokens = labels[0, 3:]
         alignment = np.zeros((total_length, len(full_tokens) + 1))
         for start in reversed(get_starts(total_length, n_ctx, hop_length)):
             end = start + n_ctx
             alignment_hop = alignment_hops[start][item]
             indices = indices_hops[start][item]
-
             alignment[start:end, indices] = alignment_hop
         alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
         alignments.append(alignment)
@@ -2844,7 +2843,7 @@ def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, tot
             )
         return music_tokens
 
-    # Sample multiple levels
+    @torch.no_grad()
     def _sample(
         self,
         music_tokens,
@@ -2864,12 +2863,13 @@ def _sample(
         fp16=False,
     ):
         top_prior = self.priors[-1]
-        total_length = (
-            sample_length
-            if sample_length is not None
-            else (int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens)
-            * top_prior.raw_to_tokens
-        )
+        if sample_length is not None:
+            total_length = sample_length
+        else:
+            total_length = (
+                int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens
+            ) * top_prior.raw_to_tokens
+
         sampling_kwargs = [
             dict(
                 temp=0.99,
@@ -2932,17 +2932,10 @@ def _sample(
                 if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     gc.collect()
                     torch.cuda.empty_cache()
-                    # should memory profile this function it takes up to 77% of 88GB RAM which seems to be a loooooooot 96%
-                    # has to be either a memory leak or, since all the attention is stored, everything is stored and it
-                    # is not optimal
-                    alignments = get_alignment(
-                        music_tokens,
-                        labels[-1],
-                        self.priors[-1],
-                        sampling_kwargs[-1]["fp16"],
-                        self.config,
-                    )
-                    pass  # consumes too much ram
+                    with torch.no_grad():
+                        alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], fp16, self.config)
+                    torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
+                    # disable saving to html, TODO should we do it
         return music_tokens
 
     # Generate ancestral samples given a list of artists and genres
@@ -2979,9 +2972,3 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
 
 
 # TODO add tied embeddings for the lyric encoder lm head as well as the proj_out when they are not seperated.
-
-
-# Training the prior on next token prediction using a bert tokenizer would make more sens than only predicting the letter
-# Indeed the model in unconditional sampling does not generate proper lyrics. Thus should have been
-# That is why we hear jibbrish. Could also have 3 levels of encoding were you predict entires sentences corresponding to the
-# highest level, then lower level words etc

From ab624d91f161e0e08c2ce397540da83a7bf8b6b2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 12 Sep 2022 08:49:36 +0000
Subject: [PATCH 108/196] nits

---
 src/transformers/models/jukebox/modeling_jukebox.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index b46a416f6f630..aba6a1bc600a8 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2361,7 +2361,8 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
 
     def get_cond(self, music_tokens_conds, metadata):
         """
-        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens can be None.
+        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens
+        can be None.
         """
         if metadata is not None:
             n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens

From 130260fe9fdfb43374e5b09b578c4e7685b46b80 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 08:06:43 +0000
Subject: [PATCH 109/196] update code add docstring remove oneline function

---
 .../models/jukebox/modeling_jukebox.py        | 124 +++++++++++++-----
 1 file changed, 89 insertions(+), 35 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index aba6a1bc600a8..28b7208ff8a59 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -306,20 +306,6 @@ def forward(self, hidden_states, all_levels=True):
         return hidden_state
 
 
-def dont_update(params):
-    for param in params:
-        param.requires_grad = False
-
-
-def update(params):
-    for param in params:
-        param.requires_grad = True
-
-
-def calculate_strides(strides, downs):
-    return [stride**down for stride, down in zip(strides, downs)]
-
-
 class JukeboxBottleneckBlock(nn.Module):
     def __init__(self, codebook_dim, codebook_width, mu):
         super().__init__()
@@ -526,10 +512,22 @@ def forward(self, input_audio):
 
 
 class JukeboxVQVAE(PreTrainedModel):
+    """
+
+    Args:
+        PreTrainedModel (_type_): _description_
+
+    Raises:
+        NotImplementedError: _description_ TypeError: _description_
+
+    Returns:
+        _type_: _description_
+    """
+
     def __init__(self, config):
         super().__init__(config)
         if not config.sample_length:
-            downsamples = calculate_strides(config.vqvae_strides_t, config.vqvae_downs_t)
+            downsamples = [stride**down for stride, down in zip(config.vqvae_strides_t, config.vqvae_down_t)]
             top_raw_to_tokens = np.prod(downsamples)
             config.sample_length = (
                 (config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens) * top_raw_to_tokens
@@ -559,7 +557,7 @@ def __init__(self, config):
         x_shape, x_channels = input_shape[:-1], input_shape[-1]
         self.x_shape = x_shape
 
-        self.downsamples = calculate_strides(strides_t, downs_t)
+        self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
         self.hop_lengths = np.cumprod(self.downsamples)
         self.levels = levels = config.vqvae_levels
         self.music_tokens_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
@@ -717,18 +715,6 @@ def forward(self, input):
             return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
-def repeat(hidden_states, n_repeat, dim):
-    if dim == -1:
-        dim = len(hidden_states.shape) - 1
-    return (
-        hidden_states.view(
-            int(np.prod(hidden_states.shape[: dim + 1])), 1, int(np.prod(hidden_states.shape[dim + 1 :]))
-        )
-        .repeat(1, n_repeat, 1)
-        .view(*hidden_states.shape[:dim], n_repeat * hidden_states.shape[dim], *hidden_states.shape[dim + 1 :])
-    )
-
-
 def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
     # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
     if mask is None or query_length == 1:
@@ -756,7 +742,6 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
 
 
 class JukeboxAttention(nn.Module):
-    # previously FactoredAttention
     def __init__(
         self,
         width,
@@ -2215,7 +2200,7 @@ def conditioner_block(_level):
             )
 
         self.n_ctx = self.gen_loss_dims
-        self.downsamples = calculate_strides(config.cond_strides_t, config.cond_downs_t)
+        self.downsamples = [stride**down for stride, down in zip(config.cond_strides_t, config.cond_downs_t)]
         self.cond_downsample = self.downsamples[level + 1] if level != self.levels - 1 else None
         self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
         self.sample_length = self.n_ctx * self.raw_to_tokens
@@ -2725,8 +2710,34 @@ def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_second
     return raw_audio
 
 
+JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
+            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_lenght)` :
+                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
+                condition the generation.
+            sampling_kwargs (`Dict[Any]`): 
+                Various additional sampling arguments that are used by the `_sample` function.
+                - metas=None,
+                - chunk_size=32,
+                - sampling_temperature=0.98,
+                - lower_batch_size=16,
+                - max_batch_size=16,
+                - sample_length_in_seconds=24,
+                - alignments=None,
+                - sample_tokens=None,
+                - offset=0,
+                - save_results=True,
+                - sample_length=None,
+                - fp16=False,
+
+"""
+
+
 @add_start_docstrings(
-    "The bare JUKEBOX Model from which you can sample",
+    """The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
+`continue_sample` and `ancestral_sample`.
+    It does not have a `forward` method as the training is not end to end. If you want to fine tune the model, it is
+    recommended to use the `JukeboxPrior` class and train each prior individually.
+    """,
     JUKEBOX_START_DOCSTRING,
 )
 class JukeboxModel(JukeboxPreTrainedModel):
@@ -2738,6 +2749,12 @@ def __init__(self, config):
         config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
         self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
 
+    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+        return self.vqvae.decode(music_tokens, start_level, end_level, bs_chunks)
+
+    def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+        return self.vqvae.encode(input_audio, start_level, end_level, bs_chunks)
+
     def split_batch(self, obj, n_samples, split_size):
         n_passes = (n_samples + split_size - 1) // split_size
         if isinstance(obj, torch.Tensor):
@@ -2939,8 +2956,16 @@ def _sample(
                     # disable saving to html, TODO should we do it
         return music_tokens
 
-    # Generate ancestral samples given a list of artists and genres
+    @add_start_docstrings(
+        """
+        Args: 
+        Generate music tokens based on the provided `labels. Will start at the desired prior level and automatically
+        upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
+        the VQ-VAE decoder to convert the music tokens to raw audio.""",
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
+
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = [
             torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))
@@ -2948,19 +2973,48 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
-    # Continue ancestral sampling from previously saved codes
+    @add_start_docstrings(
+        """
+        Args: 
+        Generate a continuation of the previously generated tokens.
+            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) : 
+                A sequence of music tokens which will be used as context to continue the sampling process. Should have
+                `self.levels` tensors, each corresponding to the generation at a certain level.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
     def continue_sample(self, music_tokens, labels, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
-    # Upsample given already generated upper-level codes
+    @add_start_docstrings(
+        """
+        Args: 
+        Upsamples a sequence of music tokens using the prior at level `level`.
+            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) : 
+                A sequence of music tokens which will be used as context to continue the sampling process. Should have
+                `self.levels` tensors, each corresponding to the generation at a certain level.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
     def upsample(self, music_tokens, labels, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1)))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
 
-    # Prompt the model with raw audio input (dimension: NTC) and generate continuations
+    @add_start_docstrings(
+        """
+        Args: 
+        Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
+        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are used
+        as conditioning for each level, which means that no ancestral sampling is required.
+            raw_audio (`List[torch.Tensor`] of length `n_samples` ) : 
+                A list of raw audio that will be used as conditioning information for each samples that will be
+                generated.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
     def primed_sample(self, raw_audio, labels, **sampling_kwargs):
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         self.vqvae.to(raw_audio.device).float()

From 004e78b219f295c6d935bd4e83e1498bc634672a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 08:21:29 +0000
Subject: [PATCH 110/196] modre doc

---
 .../models/jukebox/modeling_jukebox.py        | 63 +++++++++++--------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 28b7208ff8a59..63fa43b37ea03 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -515,13 +515,13 @@ class JukeboxVQVAE(PreTrainedModel):
     """
 
     Args:
-        PreTrainedModel (_type_): _description_
+        PreTrainedModel (`__type__`): _description_
 
     Raises:
         NotImplementedError: _description_ TypeError: _description_
 
     Returns:
-        _type_: _description_
+        `__type__`: _description_
     """
 
     def __init__(self, config):
@@ -2714,20 +2714,9 @@ def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_second
             labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_lenght)` :
                 List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                 condition the generation.
-            sampling_kwargs (`Dict[Any]`): 
-                Various additional sampling arguments that are used by the `_sample` function.
-                - metas=None,
-                - chunk_size=32,
-                - sampling_temperature=0.98,
-                - lower_batch_size=16,
-                - max_batch_size=16,
-                - sample_length_in_seconds=24,
-                - alignments=None,
-                - sample_tokens=None,
-                - offset=0,
-                - save_results=True,
-                - sample_length=None,
-                - fp16=False,
+            sampling_kwargs (`Dict[Any]`):
+                Various additional sampling arguments that are used by the `_sample` function. A detail list of the
+                arguments can bee seen in the [`_sample`] function documentation.
 
 """
 
@@ -2880,6 +2869,29 @@ def _sample(
         sample_length=None,
         fp16=False,
     ):
+        """_summary_
+
+        Args:
+            music_tokens (`__type__`): _description_
+            labels (`__type__`): _description_
+            sample_levels (`__type__`): _description_
+            metas (`__type__`, optional): _description_. Defaults to None.
+            chunk_size (int, optional): _description_. Defaults to 32.
+            sampling_temperature (float, optional): _description_. Defaults to 0.98.
+            lower_batch_size (int, optional): _description_. Defaults to 16.
+            max_batch_size (int, optional): _description_. Defaults to 16.
+            sample_length_in_seconds (int, optional): _description_. Defaults to 24.
+            alignments (`__type__`, optional): _description_. Defaults to None.
+            sample_tokens (`__type__`, optional): _description_. Defaults to None.
+            offset (int, optional): _description_. Defaults to 0.
+            save_results (bool, optional): _description_. Defaults to True.
+            sample_length (`__type__`, optional): _description_. Defaults to None.
+            fp16 (bool, optional): _description_. Defaults to False.
+
+        Returns:
+            `__type__`: _description_
+        """
+
         top_prior = self.priors[-1]
         if sample_length is not None:
             total_length = sample_length
@@ -2958,7 +2970,7 @@ def _sample(
 
     @add_start_docstrings(
         """
-        Args: 
+        Args:
         Generate music tokens based on the provided `labels. Will start at the desired prior level and automatically
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.""",
@@ -2975,9 +2987,9 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
 
     @add_start_docstrings(
         """
-        Args: 
+        Args:
         Generate a continuation of the previously generated tokens.
-            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) : 
+            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
         """,
@@ -2990,9 +3002,9 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs):
 
     @add_start_docstrings(
         """
-        Args: 
+        Args:
         Upsamples a sequence of music tokens using the prior at level `level`.
-            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) : 
+            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
         """,
@@ -3005,11 +3017,11 @@ def upsample(self, music_tokens, labels, **sampling_kwargs):
 
     @add_start_docstrings(
         """
-        Args: 
+        Args:
         Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
-        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are used
+        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are used:
         as conditioning for each level, which means that no ancestral sampling is required.
-            raw_audio (`List[torch.Tensor`] of length `n_samples` ) : 
+            raw_audio (`List[torch.Tensor`] of length `n_samples` ) :
                 A list of raw audio that will be used as conditioning information for each samples that will be
                 generated.
         """,
@@ -3024,6 +3036,3 @@ def primed_sample(self, raw_audio, labels, **sampling_kwargs):
             )
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
-
-
-# TODO add tied embeddings for the lyric encoder lm head as well as the proj_out when they are not seperated.

From dc626e55e661494d63161f1b668ec1ba07dc0cd0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 09:04:48 +0000
Subject: [PATCH 111/196] update template doc

---
 .../models/jukebox/modeling_jukebox.py        | 59 +++++++++++++++----
 1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 63fa43b37ea03..48f6c56fb5d3b 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -19,6 +19,7 @@
 import os
 import sys
 import time
+from typing import List
 
 import numpy as np
 import torch
@@ -2717,7 +2718,6 @@ def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_second
             sampling_kwargs (`Dict[Any]`):
                 Various additional sampling arguments that are used by the `_sample` function. A detail list of the
                 arguments can bee seen in the [`_sample`] function documentation.
-
 """
 
 
@@ -2868,8 +2868,10 @@ def _sample(
         save_results=True,
         sample_length=None,
         fp16=False,
-    ):
-        """_summary_
+    ) -> List[torch.LongTensor]:
+        """
+        Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
+        the generated raw audio at each step.
 
         Args:
             music_tokens (`__type__`): _description_
@@ -2890,7 +2892,21 @@ def _sample(
 
         Returns:
             `__type__`: _description_
-        """
+
+        Example:
+        ```python
+        >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
+        >>> tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
+        >>> model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
+
+        >>> tokens = tokenizer(**self.metas)["input_ids"]
+
+        >>> zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        >>> zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
+        [1864,
+        1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534, 1000, 1445, 1105, 1130, 967, 515, 1434, 1620,
+        534, 1495, 283, 1445, 333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883, 1405, 1276, 1455, 1228]
+        ```"""
 
         top_prior = self.priors[-1]
         if sample_length is not None:
@@ -2969,14 +2985,34 @@ def _sample(
         return music_tokens
 
     @add_start_docstrings(
-        """
+        r"""
         Args:
         Generate music tokens based on the provided `labels. Will start at the desired prior level and automatically
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.""",
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import JukeboxTokenizer, JukeboxModel
+
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics")
+        >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+
+        >>> lyrics = "Hey, are you awake? Can you talk to me?"
+        >>> artist = "Zac Brown Band"
+        >>> genre = ("Country",)
+        >>> metas = tokenizer(artist=artist, genre=genre, lyrics=lyrics)
+
+        >>> # Generate
+        >>> music_tokens = model.ancestral_sample(metas, sample_length_in_seconds=2)
+        >>> model.decode(music_tokens)[:, :, 30]
+        [...,...,...]
+        ```
+        """,
     )
-    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
+    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
 
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = [
@@ -2995,7 +3031,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs):
         """,
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
     )
-    def continue_sample(self, music_tokens, labels, **sampling_kwargs):
+    def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> List[torch.LongTensor]:
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
@@ -3007,10 +3043,11 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs):
             music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
+
         """,
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
     )
-    def upsample(self, music_tokens, labels, **sampling_kwargs):
+    def upsample(self, music_tokens, labels, **sampling_kwargs) -> List[torch.LongTensor]:
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1)))
         music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
         return music_tokens
@@ -3019,15 +3056,15 @@ def upsample(self, music_tokens, labels, **sampling_kwargs):
         """
         Args:
         Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
-        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are used:
-        as conditioning for each level, which means that no ancestral sampling is required.
+        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
+        used: as conditioning for each level, which means that no ancestral sampling is required.
             raw_audio (`List[torch.Tensor`] of length `n_samples` ) :
                 A list of raw audio that will be used as conditioning information for each samples that will be
                 generated.
         """,
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
     )
-    def primed_sample(self, raw_audio, labels, **sampling_kwargs):
+    def primed_sample(self, raw_audio, labels, **sampling_kwargs) -> List[torch.LongTensor]:
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         self.vqvae.to(raw_audio.device).float()
         with torch.no_grad():

From 2eadac7b1033946871c91f4c8af27e4319f6e7b3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 11:29:44 +0000
Subject: [PATCH 112/196] fix doc

---
 .../models/jukebox/modeling_jukebox.py        | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 48f6c56fb5d3b..662d998414a7d 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2985,13 +2985,14 @@ def _sample(
         return music_tokens
 
     @add_start_docstrings(
-        r"""
-        Args:
-        Generate music tokens based on the provided `labels. Will start at the desired prior level and automatically
+        """
+        Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
-        the VQ-VAE decoder to convert the music tokens to raw audio.""",
+        the VQ-VAE decoder to convert the music tokens to raw audio.
+        
+        Args:""",
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
-        r"""
+        """
         Example:
 
         ```python
@@ -3022,9 +3023,9 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         return music_tokens
 
     @add_start_docstrings(
-        """
+        """Generates a continuation of the previously generated tokens.
+        
         Args:
-        Generate a continuation of the previously generated tokens.
             music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
@@ -3037,9 +3038,9 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> List[torch
         return music_tokens
 
     @add_start_docstrings(
-        """
+        """Upsamples a sequence of music tokens using the prior at level `level`.
+
         Args:
-        Upsamples a sequence of music tokens using the prior at level `level`.
             music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
@@ -3053,11 +3054,11 @@ def upsample(self, music_tokens, labels, **sampling_kwargs) -> List[torch.LongTe
         return music_tokens
 
     @add_start_docstrings(
-        """
-        Args:
-        Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
+        """Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
         generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
         used: as conditioning for each level, which means that no ancestral sampling is required.
+        
+        Args:
             raw_audio (`List[torch.Tensor`] of length `n_samples` ) :
                 A list of raw audio that will be used as conditioning information for each samples that will be
                 generated.

From 374b16737333c7a93a68ea3fdee400875259f4c8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 11:31:17 +0000
Subject: [PATCH 113/196] test documentation rendering

---
 src/transformers/models/jukebox/modeling_jukebox.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 662d998414a7d..be45f24ddcee7 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2989,7 +2989,7 @@ def _sample(
         Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.
-        
+
         Args:""",
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
         """
@@ -3024,7 +3024,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
 
     @add_start_docstrings(
         """Generates a continuation of the previously generated tokens.
-        
+
         Args:
             music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
@@ -3057,7 +3057,7 @@ def upsample(self, music_tokens, labels, **sampling_kwargs) -> List[torch.LongTe
         """Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
         generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
         used: as conditioning for each level, which means that no ancestral sampling is required.
-        
+
         Args:
             raw_audio (`List[torch.Tensor`] of length `n_samples` ) :
                 A list of raw audio that will be used as conditioning information for each samples that will be

From b96c50d28ffc1b2e0ecbd8c2b40e87c70ca8061b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 12:23:45 +0000
Subject: [PATCH 114/196] update doc and config doc

---
 .../models/jukebox/configuration_jukebox.py   |  20 +--
 .../models/jukebox/modeling_jukebox.py        | 137 +++++++++++-------
 2 files changed, 97 insertions(+), 60 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index f96c2b9159e82..acd713d14b72f 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -52,8 +52,8 @@ class JukeboxConfig(PretrainedConfig):
             trained using a top prior and 2 upsampler priors.
         timing_dims (`int`, *optional*, defaults to 64):
             Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
-            layer. #TODO the timing embedding layer converts the absolute and relative position in the currently
-            sampled audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
+            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
+            audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
         single_enc_dec (`list`, *optional*, defaults to [True, False, False]):
             Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
             `lyric_encoder` for each of the priors.
@@ -61,7 +61,7 @@ class JukeboxConfig(PretrainedConfig):
             Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
             duration.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
-            # FIXME is that the same as single_enc_dec ??
+            Whether or not the decoder is merged with the encoder.
         lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
             Whether or not to use the lyrics as conditioning.
         nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
@@ -75,7 +75,7 @@ class JukeboxConfig(PretrainedConfig):
         init_std (`float`, *optional*, defaults to 0.2):
             Standard deviation used to inital the model.
         hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
-            # TODO detail this amount of space between each of the sampling windows oif `n_ctx` tokens
+            Fraction of non-intersecting window used when continuing the sampling process.
         cond_zero_out (`bool`, *optional*, defaults to False):
             Zero out weights when initialising.
         cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
@@ -92,21 +92,23 @@ class JukeboxConfig(PretrainedConfig):
         cond_m_conv (`int`, *optional*, defaults to 1):
             Conditionner multiplier (the input states are mulitplied by that parameter for each convolution.
         cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-            Downsampling ... # TODO
+            Downsampling rates used in the audio conditioning network
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-            Striding pattern to use #TODO
+            Striding used in the audio conditioning network
         lyric_enc_spread (`bool`, *optional*, defaults to False):
             Spread used in the attention pattern #TODO check what that is actually
         lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
             Width of the lyric encoder
         lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
-            Number of blocks used in the lyric encoder is this different from lyric_enc_blocks? FIXME
+            Number of encoder blocks used in the lyric encoder
         lyric_enc_heads (`int`, *optional*, defaults to 4):
             Number of heads in the lyric encoder
         lyric_enc_m_attn (`float`, *optional*, defaults to 0.25):
-            # again, m_attn and m_mlp, I don't really know how to rename it
+            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
+            0.25*width of the model will be used.
         lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
-            # again, m_attn and m_mlp, I don't really know how to rename it
+            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
+            the model will be used.
         lyric_enc_blocks (`int`, *optional*, defaults to 32):
 
         lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index be45f24ddcee7..107ae1f9dbdb1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -91,12 +91,12 @@ def forward(self, hidden_states):
 
 
 class JukeboxResConv1DBlock(nn.Module):
-    def __init__(self, n_in, n_state, dilation=1, zero_out=False, res_scale=1.0):
+    def __init__(self, n_in, hidden_dim, dilation=1, zero_out=False, res_scale=1.0):
         super().__init__()
         padding = dilation
         self.relu = nn.ReLU()
-        self.conv1d_1 = nn.Conv1d(n_in, n_state, 3, 1, padding, dilation)
-        self.conv1d_2 = nn.Conv1d(n_state, n_in, 1, 1, 0)
+        self.conv1d_1 = nn.Conv1d(n_in, hidden_dim, 3, 1, padding, dilation)
+        self.conv1d_2 = nn.Conv1d(hidden_dim, n_in, 1, 1, 0)
         self.res_scale = res_scale
 
     def forward(self, hidden_states):
@@ -512,19 +512,31 @@ def forward(self, input_audio):
         return music_tokens, quantised_states, commit_losses, metrics
 
 
-class JukeboxVQVAE(PreTrainedModel):
-    """
+JUKEBOX_START_DOCSTRING = r"""
 
-    Args:
-        PreTrainedModel (`__type__`): _description_
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
 
-    Raises:
-        NotImplementedError: _description_ TypeError: _description_
+    Parameters:
+        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
 
-    Returns:
-        `__type__`: _description_
-    """
 
+@add_start_docstrings(
+    """The Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
+Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://arxiv.org/abs/2002.08111).
+
+    """,
+    JUKEBOX_START_DOCSTRING,
+)
+class JukeboxVQVAE(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         if not config.sample_length:
@@ -624,6 +636,18 @@ def _decode(self, music_tokens, start_level=0, end_level=None):
         return dequantised_state
 
     def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+        """
+        _summary_
+
+        Args:
+            music_tokens (_type_): _description_
+            start_level (int, optional): _description_. Defaults to 0.
+            end_level (_type_, optional): _description_. Defaults to None.
+            bs_chunks (int, optional): _description_. Defaults to 1.
+
+        Returns:
+            _type_: _description_
+        """
         token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
         dequantised_states = []
         for i in range(bs_chunks):
@@ -646,6 +670,19 @@ def _encode(self, raw_audio, start_level=0, end_level=None):
         return music_tokens[start_level:end_level]
 
     def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+        """
+        _summary_
+
+        Args:
+            input_audio (_type_): _description_
+            start_level (int, optional): _description_. Defaults to 0.
+            end_level (_type_, optional): _description_. Defaults to None.
+            bs_chunks (int, optional): _description_. Defaults to 1.
+
+
+        Returns:
+            _type_: _description_
+        """
         audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
         music_tokens_list = []
         for chunk_i in audio_chunks:
@@ -662,6 +699,29 @@ def sample(self, n_samples):
         return self.decode(music_tokens)
 
     def forward(self, raw_audio):
+        """
+        Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
+        The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
+        computed.
+
+
+        Args:
+            raw_audio (`torch.FloatTensor`):
+                Audio input which will be encoded and decoded.
+
+
+        Returns:
+            `Tuple[torch.Tensor, torch.Tensoor`
+
+
+        Example:
+        ```python
+        >>> model = JukeboxVQVAE.from_pretrained(self.model_id).eval()
+
+        >>> zs = [torch.random(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        >>> zs = model(zs)
+        ```"""
+
         # Encode/Decode
         input_audio = self.preprocess(raw_audio)
         latent_states = []
@@ -687,11 +747,11 @@ def forward(self, raw_audio):
 
 
 class JukeboxMLP(nn.Module):
-    def __init__(self, width, n_state, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
+    def __init__(self, width, hidden_dim, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
         # a single channel is always used in original code
         super().__init__()
-        self.c_fc = JukeboxConv1D(width, n_state)
-        self.c_proj = JukeboxConv1D(n_state, width, zero_out)
+        self.c_fc = JukeboxConv1D(width, hidden_dim)
+        self.c_proj = JukeboxConv1D(hidden_dim, width, zero_out)
         self.act = ACT2FN[afn]
         self.dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
 
@@ -747,7 +807,7 @@ def __init__(
         self,
         width,
         n_ctx,
-        n_state,
+        hidden_dim,
         num_heads,
         attn_dropout=0.0,
         resid_dropout=0.0,
@@ -764,16 +824,16 @@ def __init__(
         super().__init__()
         self.width = width  # should have a better name
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
-        self.n_state = n_state
+        self.hidden_dim = hidden_dim
         self.num_heads = num_heads
         self.scale = scale
         self.mask = mask
         if attn_func == 6:
-            self.c_attn = JukeboxConv1D(width, n_state)
-            self.c_enc_kv = JukeboxConv1D(width, n_state * 2)
+            self.c_attn = JukeboxConv1D(width, hidden_dim)
+            self.c_enc_kv = JukeboxConv1D(width, hidden_dim * 2)
         else:
-            self.c_attn = JukeboxConv1D(width, n_state * 3)
-        self.c_proj = JukeboxConv1D(n_state, width, zero_out)
+            self.c_attn = JukeboxConv1D(width, hidden_dim * 3)
+        self.c_proj = JukeboxConv1D(hidden_dim, width, zero_out)
         self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
         self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
 
@@ -805,7 +865,7 @@ def __init__(
         self.w = None
 
     def _attn(self, query_states, key_states, value_states, sample):
-        scale = 1.0 / math.sqrt(math.sqrt(self.n_state // self.num_heads))
+        scale = 1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.num_heads))
         if self.training:
             attention_weight = torch.matmul(query_states * scale, key_states * scale)
         else:
@@ -1172,8 +1232,8 @@ def check_cache(self, n_samples, sample_t, fp16):
         else:
             dtype = {True: torch.float16, False: torch.float32}[fp16]
             l_cache = self._suff_cache_len()
-            assert self.cache["key"].shape == (n_samples, l_cache, self.n_state)
-            assert self.cache["value"].shape == (n_samples, l_cache, self.n_state)
+            assert self.cache["key"].shape == (n_samples, l_cache, self.hidden_dim)
+            assert self.cache["value"].shape == (n_samples, l_cache, self.hidden_dim)
             assert self.cache["key"].dtype == dtype, f"Expected {dtype}, got {self.cache['key'].dtype}"
             assert self.cache["value"].dtype == dtype, f"Expected {dtype}, got {self.cache['value'].dtype}"
 
@@ -1204,7 +1264,7 @@ def __init__(
         self.attn = JukeboxAttention(
             width=width,
             n_ctx=n_ctx,
-            n_state=int(m_attn * width),
+            hidden_dim=int(m_attn * width),
             num_heads=num_heads,
             attn_dropout=attn_dropout,
             resid_dropout=resid_dropout,
@@ -1222,7 +1282,7 @@ def __init__(
         self.layer_norm_0 = JukeboxLayerNorm(width)
         self.mlp = JukeboxMLP(
             width=width,
-            n_state=int(m_mlp * width),
+            hidden_dim=int(m_mlp * width),
             resid_dropout=resid_dropout,
             afn=afn,
             zero_out=zero_out,
@@ -2563,25 +2623,6 @@ def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
 
-JUKEBOX_START_DOCSTRING = r"""
-
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-JUKEBOX_SAMPLE_INPUT_DOCSTRING = r""""""
-
-
 # Break total_length into hops/windows of size n_ctx separated by hop_length
 def get_starts(total_length, n_ctx, hop_length):
     starts = []
@@ -2593,13 +2634,7 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-# NOTE, consumes a lot of RAM so should probably be ran on CPU
 def get_alignment(music_tokens, labels, prior, fp16, config):
-    """
-    Compute the lyric to music token alignment, but for now it cannot be used.
-
-    IN THE Oiginal code,
-    """
     level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]

From b386353ad3c8b4dba709ec87a7dc453c3d0b2f31 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 13:43:44 +0000
Subject: [PATCH 115/196] remove cehck cache

---
 .../models/jukebox/modeling_jukebox.py        | 100 ++++++++----------
 1 file changed, 42 insertions(+), 58 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 107ae1f9dbdb1..296f58ca961a8 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -635,18 +635,20 @@ def _decode(self, music_tokens, start_level=0, end_level=None):
         dequantised_state = self.postprocess(dequantised_state)
         return dequantised_state
 
-    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> torch.Tensor:
         """
-        _summary_
+        Transforms the input `music_tokens` to their `raw_audio` representation.
 
         Args:
-            music_tokens (_type_): _description_
-            start_level (int, optional): _description_. Defaults to 0.
-            end_level (_type_, optional): _description_. Defaults to None.
-            bs_chunks (int, optional): _description_. Defaults to 1.
-
-        Returns:
-            _type_: _description_
+            music_tokens (`torch.LongTensor`):
+                Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
+                should be an index to a coresponding `code` vector in the codebook.
+            start_level (`int`, *optional*):
+                Level at which the decoding process will start. Default to 0.
+            end_level (`int`, *optional*):
+                Level at which the decoding process will start. Default to None.
+            bs_chunks (int, *optional*):
+                Number of chuncks to process at the same time.
         """
         token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
         dequantised_states = []
@@ -671,17 +673,18 @@ def _encode(self, raw_audio, start_level=0, end_level=None):
 
     def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
         """
-        _summary_
+        Transforms the `input_audio` to a discrete representation made out of `music_tokens`.
 
         Args:
-            input_audio (_type_): _description_
-            start_level (int, optional): _description_. Defaults to 0.
-            end_level (_type_, optional): _description_. Defaults to None.
-            bs_chunks (int, optional): _description_. Defaults to 1.
-
-
-        Returns:
-            _type_: _description_
+            input_audio (`torch.Tensor`):
+                Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
+                form the codebook will be computed for each sequence of samples.
+            start_level (`int`, *optional*):
+                Level at which the encoding process will start. Default to 0.
+            end_level (`int`, *optional*):
+                Level at which the encoding process will start. Default to None.
+            bs_chunks (int, *optional*):
+                Number of chuncks of raw audio to process at the same time.
         """
         audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
         music_tokens_list = []
@@ -1225,18 +1228,6 @@ def del_cache(self):
             del self.cache["value"]
         self.cache = {}
 
-    def check_cache(self, n_samples, sample_t, fp16):
-        assert self.sample_t == sample_t, f"{self.sample_t} != {sample_t}"
-        if sample_t == 0:
-            assert self.cache == {}
-        else:
-            dtype = {True: torch.float16, False: torch.float32}[fp16]
-            l_cache = self._suff_cache_len()
-            assert self.cache["key"].shape == (n_samples, l_cache, self.hidden_dim)
-            assert self.cache["value"].shape == (n_samples, l_cache, self.hidden_dim)
-            assert self.cache["key"].dtype == dtype, f"Expected {dtype}, got {self.cache['key'].dtype}"
-            assert self.cache["value"].dtype == dtype, f"Expected {dtype}, got {self.cache['value'].dtype}"
-
 
 class JukeboxBlock(nn.Module):
     def __init__(
@@ -1432,10 +1423,6 @@ def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=F
             hidden_states = hidden_states.float()
         return hidden_states
 
-    def check_cache(self, n_samples, sample_t, fp16):
-        for attn_layer in self._attn_mods:
-            attn_layer.attn.check_cache(n_samples, sample_t, fp16)
-
     def del_cache(self):
         for attn_layer in self._attn_mods:
             attn_layer.attn.del_cache()
@@ -1691,7 +1678,7 @@ def sample(
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
                 )
-                self.transformer.check_cache(n_samples, sample_t, fp16)
+
                 hidden_states = self.transformer(
                     hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
                 )
@@ -1801,17 +1788,13 @@ def primed_sample(
 
             gc.collect()
             torch.cuda.empty_cache()
-            self.transformer.check_cache(n_samples, len(sampled_audio), fp16)
-
             hidden_states = sampled_audio[-1]
-            gc.collect()
-            torch.cuda.empty_cache()
 
             for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
                 )
-                self.transformer.check_cache(n_samples, sample_t, fp16)
+
                 hidden_states = self.transformer(
                     hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
                 )  # Transformer
@@ -1917,8 +1900,10 @@ def postprocess(self, hidden_states):
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         Args :
-            - music_tokens : int or long, in range(codebook_dim)
-            - raw_audio_conditionning : used when primed sampling, raw audio information that conditions
+            music_tokens (`torch.LongTensor`):
+                Music tokens form the uper level in range(codebook_dim)
+            raw_audio_conditionning (`torch.LongTensor`):
+                Audio used when primed sampling, raw audio information that conditions
             the generation
         """
         if raw_audio_conditionning is None:
@@ -1937,7 +1922,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
 
 
 class JukeboxSimpleEmbedding(nn.Module):
-    def __init__(self, embed_dim, out_width, init_scale):
+    def __init__(self, embed_dim, out_width):
         super().__init__()
         self.embed_dim = embed_dim
         self.emb = nn.Embedding(embed_dim, out_width)
@@ -2017,8 +2002,8 @@ def __init__(
         self.out_width = out_width
         nb_genres, nb_artists = metadata_dims
         self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width, init_scale)
-        self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width, init_scale)
+        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)
+        self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
         self.include_time_signal = include_time_signal
         if self.include_time_signal:
             t_ranges = (
@@ -2026,7 +2011,6 @@ def __init__(
                 (0.0, max_duration * sampling_rate),  # Absolute pos
                 (0.0, 1.0),
             )  # Relative pos
-            assert len(t_ranges) == 3, f"Expecting (total, absolute, relative) ranges, got {t_ranges}"
             total_length_range, absolute_pos_range, relative_pos_range = t_ranges
             self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
             self.absolute_pos_emb = JukeboxRangeEmbedding(
@@ -2912,18 +2896,18 @@ def _sample(
             music_tokens (`__type__`): _description_
             labels (`__type__`): _description_
             sample_levels (`__type__`): _description_
-            metas (`__type__`, optional): _description_. Defaults to None.
-            chunk_size (int, optional): _description_. Defaults to 32.
-            sampling_temperature (float, optional): _description_. Defaults to 0.98.
-            lower_batch_size (int, optional): _description_. Defaults to 16.
-            max_batch_size (int, optional): _description_. Defaults to 16.
-            sample_length_in_seconds (int, optional): _description_. Defaults to 24.
-            alignments (`__type__`, optional): _description_. Defaults to None.
-            sample_tokens (`__type__`, optional): _description_. Defaults to None.
-            offset (int, optional): _description_. Defaults to 0.
-            save_results (bool, optional): _description_. Defaults to True.
-            sample_length (`__type__`, optional): _description_. Defaults to None.
-            fp16 (bool, optional): _description_. Defaults to False.
+            metas (`__type__`, *optional*): _description_. Defaults to None.
+            chunk_size (int, *optional*): _description_. Defaults to 32.
+            sampling_temperature (float, *optional*): _description_. Defaults to 0.98.
+            lower_batch_size (int, *optional*): _description_. Defaults to 16.
+            max_batch_size (int, *optional*): _description_. Defaults to 16.
+            sample_length_in_seconds (int, *optional*): _description_. Defaults to 24.
+            alignments (`__type__`, *optional*): _description_. Defaults to None.
+            sample_tokens (`__type__`, *optional*): _description_. Defaults to None.
+            offset (int, *optional*): _description_. Defaults to 0.
+            save_results (bool, *optional*): _description_. Defaults to True.
+            sample_length (`__type__`, *optional*): _description_. Defaults to None.
+            fp16 (bool, *optional*): _description_. Defaults to False.
 
         Returns:
             `__type__`: _description_

From 7c321d8d62d740502b2491c0150b240779d420a9 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 14:50:37 +0000
Subject: [PATCH 116/196] update tokenization doc and remove asserts

---
 .../models/jukebox/modeling_jukebox.py        | 21 +++++++-----
 .../models/jukebox/tokenization_jukebox.py    | 34 ++++++++-----------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 296f58ca961a8..cd2212612f681 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2302,11 +2302,12 @@ def get_music_tokens_conds(self, music_tokens, start, end):
         Extracts current level's conditioning music tokens.
         """
         if self.level != self.levels - 1:
-            assert start % self.cond_downsample == end % self.cond_downsample == 0
             music_tokens_cond = music_tokens[self.level + 1][
                 :, start // self.cond_downsample : end // self.cond_downsample
             ]
-            assert music_tokens_cond.shape[1] == self.n_ctx // self.cond_downsample
+            missing_cond_len = self.n_ctx // self.cond_downsample  - music_tokens_cond[-1].shape[-1]
+            if (missing_cond_len > 0 ):
+                music_tokens_cond = torch.cat((music_tokens_cond, torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)), dim=-1).long()
             music_tokens_conds = [music_tokens_cond]
         else:
             music_tokens_conds = None
@@ -3017,20 +3018,22 @@ def _sample(
         ```python
         >>> from transformers import JukeboxTokenizer, JukeboxModel
 
-        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics")
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0)
+        Level:0, Cond downsample:4, Raw to tokens:8, Sample length:65536
+        Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144
+        Level:2, Cond downsample:None, Raw to tokens:128, Sample length:786432
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
 
         >>> lyrics = "Hey, are you awake? Can you talk to me?"
         >>> artist = "Zac Brown Band"
-        >>> genre = ("Country",)
-        >>> metas = tokenizer(artist=artist, genre=genre, lyrics=lyrics)
+        >>> genre = "Country"
+        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
 
-        >>> # Generate
-        >>> music_tokens = model.ancestral_sample(metas, sample_length_in_seconds=2)
+        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2)
         >>> model.decode(music_tokens)[:, :, 30]
-        [...,...,...]
+
         ```
-        """,
+        """
     )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 485ab6fc57c7f..6f7445a66bce7 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -88,20 +88,16 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
             which represent the overall length of the signal,
     """
-
+    import torch
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
     else:
-        assert 0 <= offset < total_length
         midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
         midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
         tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
         indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
-    # assert len(tokens) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(tokens)}"
-    # assert len(indices) == max_n_lyric_tokens, f"Expected length {max_n_lyric_tokens}, got {len(indices)}"
-    # # assert tokens == [full_tokens[index] if index != -1 else 0 for index in indices]
     return tokens.unsqueeze(dim=0), indices
 
 
@@ -119,11 +115,12 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     Depending on the number of genres on which the model should be conditioned (`n_genres`).
     ```
     >>> from transformers import JukeboxTokenizer
-    >>> tokenizer = JukeboxTokenizer.from_pretrained("jukebox")
+    >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
     >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
-    ## TODO UPDATE THIS OUTPUT
-    >>> tokenizer("Alan Jackson", "Country Rock")['input_ids']
-    [6785],[546]]
+    [tensor([[  0,   0,   0, 145,   0]]),
+     tensor([[  0,   0,   0, 145,   0]]),
+     tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
+                40,   76,   44,   41,   27,   30]])]
     ```
 
     You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
@@ -144,6 +141,9 @@ class JukeboxTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
             'lyrics' and the values are their corresponding vocabulary files.
+        version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) : 
+            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of `v2`. 
+
         n_genres (`int`, `optional`, defaults to 1):
             Maximum number of genres to use for composition.
         max_n_lyric_tokens (`int`, `optional`, defaults to 512):
@@ -399,20 +399,16 @@ def convert_to_tensors(
 
         return inputs
 
-    def __call__(self, artist, genres, lyrics, return_tensors="pt") -> BatchEncoding:
+    def __call__(self, artist, genres, lyrics = "", return_tensors="pt") -> BatchEncoding:
         """Convert the raw string to a list of token ids
 
         Args:
             artist (`str`):
-                _description_
-            genre (`str`):
-                _description_
-            lyrics (`srt`):
-                _description_
-            total_length (`int`):
-                _description_
-            offset (`_type_`):
-                _description_
+                Name of the artist. 
+            genres (`str`):
+                List of genres that will be mixed to condition the audio
+            lyrics (`srt`, Optional):
+                Lyrics used to condition the generation
         """
         input_ids = [0, 0, 0]
         artist = [artist] * len(self.version)

From b93b078c66ef3ef88aa397b68e394e876d403ec7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 15:08:11 +0000
Subject: [PATCH 117/196] update test

---
 .../models/jukebox/modeling_jukebox.py        | 12 ++++++----
 .../models/jukebox/tokenization_jukebox.py    | 11 +++++----
 tests/models/jukebox/test_modeling_jukebox.py | 24 ++-----------------
 3 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index cd2212612f681..1e4948b79986d 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2305,9 +2305,11 @@ def get_music_tokens_conds(self, music_tokens, start, end):
             music_tokens_cond = music_tokens[self.level + 1][
                 :, start // self.cond_downsample : end // self.cond_downsample
             ]
-            missing_cond_len = self.n_ctx // self.cond_downsample  - music_tokens_cond[-1].shape[-1]
-            if (missing_cond_len > 0 ):
-                music_tokens_cond = torch.cat((music_tokens_cond, torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)), dim=-1).long()
+            missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
+            if missing_cond_len > 0:
+                music_tokens_cond = torch.cat(
+                    (music_tokens_cond, torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)), dim=-1
+                ).long()
             music_tokens_conds = [music_tokens_cond]
         else:
             music_tokens_conds = None
@@ -3022,6 +3024,7 @@ def _sample(
         Level:0, Cond downsample:4, Raw to tokens:8, Sample length:65536
         Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144
         Level:2, Cond downsample:None, Raw to tokens:128, Sample length:786432
+
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
 
         >>> lyrics = "Hey, are you awake? Can you talk to me?"
@@ -3031,9 +3034,8 @@ def _sample(
 
         >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2)
         >>> model.decode(music_tokens)[:, :, 30]
-
         ```
-        """
+        """,
     )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 6f7445a66bce7..24a8ef108460c 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -89,6 +89,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             which represent the overall length of the signal,
     """
     import torch
+
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
         tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
@@ -141,9 +142,9 @@ class JukeboxTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
             'lyrics' and the values are their corresponding vocabulary files.
-        version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) : 
-            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of `v2`. 
-
+        version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) :
+            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
+            `v2`.
         n_genres (`int`, `optional`, defaults to 1):
             Maximum number of genres to use for composition.
         max_n_lyric_tokens (`int`, `optional`, defaults to 512):
@@ -399,12 +400,12 @@ def convert_to_tensors(
 
         return inputs
 
-    def __call__(self, artist, genres, lyrics = "", return_tensors="pt") -> BatchEncoding:
+    def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding:
         """Convert the raw string to a list of token ids
 
         Args:
             artist (`str`):
-                Name of the artist. 
+                Name of the artist.
             genres (`str`):
                 List of genres that will be mixed to condition the audio
             lyrics (`srt`, Optional):
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 22e4c157a2ecf..f857daf0b70b2 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -138,16 +138,11 @@ def test_sampling(self):
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
-        zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
-        zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
-
         set_seed(0)
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
@@ -174,16 +169,11 @@ def test_slow_sampling(self):
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
 
-        zs[-1] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_2).unsqueeze(0)
         set_seed(0)
-        zs[-1] = torch.cat((zs[-1].cuda(), torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
 
-        zs[-2] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_1).unsqueeze(0)
-
         set_seed(0)
-        zs[-2] = torch.cat((zs[-2].cuda(), torch.zeros(1, 1000000 - zs[-2].shape[-1]).cuda()), dim=-1).long()
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
@@ -311,46 +301,36 @@ def prepare_inputs(self, model_id):
     @slow
     def test_sampling(self):
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-
         labels = self.prepare_inputs(self.model_id)
+
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
-        zs[-1] = torch.tensor(self.EXPECTED_OUTPUT_2).unsqueeze(0)
         set_seed(0)
-        zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
-        zs[-2] = torch.tensor(self.EXPECTED_OUTPUT_1).unsqueeze(0)
-
         set_seed(0)
-        zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).cpu()), dim=-1).long()
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval().to("cuda")
-
         labels = [i.cuda() for i in self.prepare_inputs(self.model_id)]
+
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
 
-        zs[-1] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_2).unsqueeze(0)
         set_seed(0)
-        zs[-1] = torch.cat((zs[-1].cuda(), torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
         zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
 
-        zs[-2] = torch.tensor(self.EXPECTED_GPU_OUTPUTS_1).unsqueeze(0)
-
         set_seed(0)
-        zs[-2] = torch.cat((zs[-2].cuda(), torch.zeros(1, 1000000 - zs[-2].shape[-1]).cuda()), dim=-1).long()
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 

From 49b2375ecf0f498f254a6dcd8508ce86e940801f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 08:21:09 +0000
Subject: [PATCH 118/196] improve config doc, remove unused args in vqvae, fix
 torch issue

---
 docs/source/en/model_doc/jukebox.mdx          | 14 ++-
 .../models/jukebox/configuration_jukebox.py   | 94 +++++++++----------
 .../models/jukebox/modeling_jukebox.py        | 80 +++++++++-------
 .../models/jukebox/tokenization_jukebox.py    | 45 +--------
 4 files changed, 99 insertions(+), 134 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index b6039ac1c6dbb..1a83d1287e68c 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -15,17 +15,15 @@ specific language governing permissions and limitations under the License.
 
 The Jukebox model was proposed in [Jukebox: A generative model for music](https://arxiv.org/pdf/2005.00341.pdf)
 by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford,
-Ilya Sutskever.
-
-This model proposes a generative music model which can be produce minute long samples which can bne conditionned on
-artist, genre and lyrics.
+Ilya Sutskever. It introduces a generative music model which can produce minute long samples that can be conditionned on
+an artist, genres and lyrics.
 
 The abstract from the paper is the following:
 
-We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.
+*We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
 
-As shown on the following figure, Jukebox is made of 3 `priors` which are decoders only. They follow a particular architecture described in `Scalable Transformers` #TODO add link to the paper. 
-An encoder model is used on the lyrics, on the first (also called `top_prior`) prior's decoder attends to the last lyric hidden states. Each prior is linked to the previous by an `AudioConditionner` module which takes care of upsampling the generated hidden state to the correct `raw_to_token` resolution.
+As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only model. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. 
+An encoder model is used on the lyrics: the first (also called `top_prior`) prior's decoder attends to the last hidden states extracted from the `lyric_encoder`. Each prior is linked to the previous by an `AudioConditionner` module which takes care of upsampling the generated dequantised states to the correct `raw_to_token` resolution of the current prior.
 The metadatas such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
 
 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/2a75e9ab330ef27ae6565fcf28b129e033ff9bed/Jukebox.svg)
@@ -49,8 +47,8 @@ The original code can be found [here](https://github.com/openai/jukebox).
 ## JukeboxModel
 
 [[autodoc]] JukeboxModel 
-    - primed_sample
     - ancestral_sample 
+    - primed_sample
     - continue_sample
     - upsample
     - _sample
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index acd713d14b72f..51253474c22f7 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -96,7 +96,7 @@ class JukeboxConfig(PretrainedConfig):
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
             Striding used in the audio conditioning network
         lyric_enc_spread (`bool`, *optional*, defaults to False):
-            Spread used in the attention pattern #TODO check what that is actually
+            Spread used in the attention pattern
         lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
             Width of the lyric encoder
         lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
@@ -110,97 +110,96 @@ class JukeboxConfig(PretrainedConfig):
             Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
             the model will be used.
         lyric_enc_blocks (`int`, *optional*, defaults to 32):
-
+            Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
         lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
-
+            Initialisation scales for the lyric encoder modules.
         lyric_enc_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
-
+            Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
         lyric_enc_attn_order (`list`, *optional*, defaults to [2, 0, 0]):
-            Which attention pattern to use for the lyric encoder
+            Which attention pattern to use for the lyric encoder.
         lyric_enc_attn_dropout (`float`, *optional*, defaults to 0.0):
-
+            Dropout probability for the post-attention layer dropout in the lyric encoder.
         lyric_enc_resid_dropout (`float`, *optional*, defaults to 0.0):
-
+            Residual dropout used in the attention pattern of the lyric encoder.
         lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
-
+            Embedding dropout used in the lyric encoder.
         lyric_enc_zero_out (`bool`, *optional*, defaults to False):
-
+            Whether or not to set to zeros the weights the MLPs in the lyric encoder.
         lyric_enc_res_scale (`bool`, *optional*, defaults to False):
-
-        lyric_enc_pos_init (`bool`, *optional*, defaults to False):
-
+            Residual scaling factor used in the lyric encoder attention patterns.
         lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
-
+            Defines the number of different tokens that can be represented by the `inputs_ids` passed to the
+            `lyric_encoder`
         prior_init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
-
+            Initialisation scales for the prior modules.
         prior_spread (`bool`, *optional*, defaults to False):
-
+            Spread used in the attention pattern
         prior_zero_out (`bool`, *optional*, defaults to False):
-
+             Whether or not to set to zeros the weights the MLPs of the priors.
         prior_res_scale (`bool`, *optional*, defaults to False):
-
-        prior_pos_init (`bool`, *optional*, defaults to False):
-
+            Residual scaling factor used in every prior's attention layer.
         prior_n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
             Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
             generating music tokens.
         prior_latent_dim (`int`, *optional*, defaults to 2048):
             Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`.
         prior_width (`list`, *optional*, defaults to [2048, 1920, 1920]):
-
+            Input and output dimension of the attention layers of each prior.
+        prior_m_attn (`float`, *optional*, defaults to 0.25):
+            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
+            0.25*prior_width of the model will be used.
         prior_depth (`list`, *optional*, defaults to [72, 72, 72]):
-
+            Depth of each prior. Defines the number of `attn_block`.
         prior_n_heads (`list`, *optional*, defaults to [2, 1, 1]):
-
+            Number of attention heads per prior.
         prior_attn_order (`list`, *optional*, defaults to [12, 2, 2]):
             Attention patterns to use in each prior. Depending on the value, cross attention, block attention and
             sparse attention blocks are stacked.
         prior_blocks (`int`, *optional*, defaults to 64):
-
+            Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
         prior_alignment_layer (`list`, *optional*, defaults to [68, None, None]):
             Layer corresponding to the alignemnt between the lyrics and the audio.
         prior_alignment_head (`list`, *optional*, defaults to [2, None, None]):
             Index of the attention head which takes care of the alignemnt between the lyrics and the audio.
-        prior_m_attn (`float`, *optional*, defaults to 0.25):
-
         prior_attn_dropout (`int`, *optional*, defaults to 0):
-
+            Dropout probability for the post-attention layer dropout of the prior models.
         prior_resid_dropout (`int`, *optional*, defaults to 0):
-
+            Residual dropout probability used in the attention layers of the prior models.
         prior_emb_dropout (`int`, *optional*, defaults to 0):
-
+            Dropout applied to the embedding layer of the priors.
         vqvae_levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
         vqvae_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-
+            Downsampling rate for each level of the hierachical VQ-VAE.
         vqvae_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-
+            Stride used for each level of the hierachical VQ-VAE.
         vqvae_emmbedding_width (`int`, *optional*, defaults to 64):
             Dimension of the codebook vectors.
         vqvae_codebook_dimension (`int`, *optional*, defaults to 2048):
             Number of codes to use in each of the VQVAE.
-        vqvae_width (`int`, *optional*, defaults to 32):
-
-        vqvae_depth (`int`, *optional*, defaults to 4):
-
         vqvae_m_conv (`int`, *optional*, defaults to 1):
-
+            Projection factor used in the `JukeboxResConv1DBlock`.
         vqvae_dilation_growth_rate (`int`, *optional*, defaults to 3):
-
-        vqvae_dilation_cycle (`bool`, *optional*, defaults to False):
-
+            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
+        vqvae_dilation_cycle (`int`, *optional*, defaults to None):
+            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
+            of reduced by a power of `vqvae_dilation_cycle`.
         vqvae_multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
-
+            Depth and width multipliers used for each level. Used on the `vqvae_conv_block_width` and
+            `vqvae_conv_block_depth`
         vqvae_lmu (`float`, *optional*, defaults to 0.99):
-
+            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
+            of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
         vqvae_commit (`float`, *optional*, defaults to 0.02):
-
+            Commit loss multiplier.
         vqvae_conv_block_depth (`int`, *optional*, defaults to 4):
-
+            Depth of the encoder and decoder block. If no `vqvae_multipliers` are used, this is the same for each
+            level.
         vqvae_conv_block_width (`int`, *optional*, defaults to 32):
-
+            Width of the encoder and decoder block. If no `vqvae_multipliers` are used, this is the same for each
+            level.
         vqvae_reverse_decoder_dilation (`int`, *optional*, defaults to 1):
-
+            Whether or not to reverse the dilation rate for the decoder.
     Example:
 
     ```python
@@ -219,10 +218,9 @@ class JukeboxConfig(PretrainedConfig):
 
     model_type = "jukebox"
     attribute_map = {
-        "hidden_size": "n_embd",
+        "hidden_size": "vqvae_codebook_dimension",
         "max_position_embeddings": "n_positions",
         "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
     }
 
     def __init__(
@@ -265,13 +263,11 @@ def __init__(
         lyric_enc_emb_dropout=0.0,
         lyric_enc_zero_out=False,
         lyric_enc_res_scale=False,
-        lyric_enc_pos_init=False,
         lyric_enc_n_vocab=79,
         prior_init_scale=[0.2, 1, 1],
         prior_spread=None,
         prior_zero_out=False,
         prior_res_scale=False,
-        prior_pos_init=False,
         prior_n_ctx=(6144, 8192, 8192),
         prior_latent_dim=2048,
         prior_width=[2048, 1920, 1920],
@@ -319,7 +315,6 @@ def __init__(
         self.prior_emb_dropout = prior_emb_dropout
         self.prior_zero_out = prior_zero_out
         self.prior_res_scale = prior_res_scale
-        self.prior_pos_init = prior_pos_init
         self.prior_blocks = prior_blocks
         self.prior_m_attn = prior_m_attn
         self.prior_spread = prior_spread
@@ -363,7 +358,6 @@ def __init__(
         self.lyric_enc_loss_fraction = lyric_enc_loss_fraction
         self.lyric_enc_m_attn = lyric_enc_m_attn
         self.lyric_enc_m_mlp = lyric_enc_m_mlp
-        self.lyric_enc_pos_init = lyric_enc_pos_init
         self.lyric_enc_resid_dropout = lyric_enc_resid_dropout
         self.lyric_enc_res_scale = lyric_enc_res_scale
         self.lyric_enc_spread = lyric_enc_spread
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 1e4948b79986d..93e7af3567d47 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -31,7 +31,6 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, logging
 from .configuration_jukebox import JukeboxConfig
-from .tokenization_jukebox import get_relevant_lyric_tokens
 
 
 logger = logging.get_logger(__name__)
@@ -65,6 +64,37 @@ def get_range(list):
     from torch.nn import LayerNorm as FusedLayerNorm
 
 
+def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
+    """
+    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
+    returned. If the provided token sequence is smaller, it will be padded, othewise, only characters ranging from the
+    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
+    the most relevant tokens (in time) for the sequence.
+
+    Args:
+        full_tokens (`List[int]`):
+            List containing the ids of the entire lyrics.
+        total_length (`int`):
+            Total expected length of the music (not all of it is generated, see duration), in samples.
+        offset (`int`):
+            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
+            account
+        duration (`int`):
+            Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
+            which represent the overall length of the signal,
+    """
+    full_tokens = full_tokens[0]
+    if len(full_tokens) < max_n_lyric_tokens:
+        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
+        indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
+    else:
+        midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+        midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
+        tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
+        indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
+    return tokens.unsqueeze(dim=0), indices
+
+
 class JukeboxConv1D(nn.Module):
     def __init__(self, n_in, n_out, zero_out=False):
         super(JukeboxConv1D, self).__init__()
@@ -332,7 +362,7 @@ def _tile(self, hidden_states):
         return hidden_states
 
     def init_codebook(self, hidden_states):
-        codebook_dim = self.codebook_dim  # mu,
+        codebook_dim = self.codebook_dim
         self.init = True
         # init k_w using random vectors from hidden_states codebook_w (index w?)
         codes = self._tile(hidden_states)
@@ -558,8 +588,6 @@ def __init__(self, config):
 
         multipliers = config.vqvae_multipliers
         codebook_width = config.vqvae_emmbedding_width
-        self.width = config.vqvae_width
-        self.depth = config.vqvae_depth
 
         self.downs_t = downs_t = config.vqvae_downs_t
         self.strides_t = strides_t = config.vqvae_strides_t
@@ -1429,18 +1457,14 @@ def del_cache(self):
 
 
 class JukeboxPositionalEmbedding(nn.Module):
-    def __init__(self, input_shape, width, init_scale=1.0, pos_init=False):
+    def __init__(self, input_shape, width, init_scale=1.0):
         super().__init__()
         self.input_shape = input_shape
         self.input_dims = np.prod(input_shape)
-        self.pos_init = pos_init
         self.pos_emb = nn.Parameter(get_normal(self.input_dims, width, std=0.01 * init_scale))
 
     def forward(self):
-        if self.pos_init:
-            pos_emb = sum([self._pos_embs[i](self.pos[:, i]) for i in range(len(self.input_shape))])
-        else:
-            pos_emb = self.pos_emb
+        pos_emb = self.pos_emb
         return pos_emb
 
 
@@ -1459,7 +1483,6 @@ def __init__(
         zero_out=False,
         init_scale=1.0,
         res_scale=False,
-        pos_init=False,
         m_attn=0.25,
         m_mlp=1,
         attn_order=0,
@@ -1501,9 +1524,7 @@ def __init__(
         if not metadata_conditioning:
             self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
 
-        self.pos_emb = JukeboxPositionalEmbedding(
-            input_shape=input_shape, width=width, init_scale=init_scale, pos_init=pos_init
-        )
+        self.pos_emb = JukeboxPositionalEmbedding(input_shape=input_shape, width=width, init_scale=init_scale)
         self.pos_emb_dropout = nn.Dropout(emb_dropout)
 
         self.transformer = JukeboxTransformer(
@@ -2109,7 +2130,6 @@ def rescale(music_tokens_shape):
             emb_dropout=config.prior_emb_dropout,
             zero_out=config.prior_zero_out,
             res_scale=config.prior_res_scale,
-            pos_init=config.prior_pos_init,
             init_scale=config.prior_init_scale[-level - 1],
             m_attn=config.prior_m_attn,
         )
@@ -2129,7 +2149,6 @@ def rescale(music_tokens_shape):
                 emb_dropout=config.lyric_enc_emb_dropout,
                 zero_out=config.lyric_enc_zero_out,
                 res_scale=config.lyric_enc_res_scale,
-                pos_init=config.lyric_enc_pos_init,
                 init_scale=config.lyric_enc_init_scale[-level - 1],
                 m_attn=config.lyric_enc_m_attn,
                 m_mlp=config.lyric_enc_m_mlp,
@@ -3003,7 +3022,7 @@ def _sample(
                     with torch.no_grad():
                         alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], fp16, self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
-                    # disable saving to html, TODO should we do it
+                    # disabled saving to html, as it requires too many dependencies.
         return music_tokens
 
     @add_start_docstrings(
@@ -3018,24 +3037,15 @@ def _sample(
         Example:
 
         ```python
-        >>> from transformers import JukeboxTokenizer, JukeboxModel
-
-        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0)
-        Level:0, Cond downsample:4, Raw to tokens:8, Sample length:65536
-        Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144
-        Level:2, Cond downsample:None, Raw to tokens:128, Sample length:786432
-
-        >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
-
-        >>> lyrics = "Hey, are you awake? Can you talk to me?"
-        >>> artist = "Zac Brown Band"
-        >>> genre = "Country"
-        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
-
-        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2)
-        >>> model.decode(music_tokens)[:, :, 30]
-        ```
-        """,
+        >>> from transformers import JukeboxTokenizer, JukeboxModel >>> model =
+        JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0) Level:0, Cond downsample:4, Raw to
+        tokens:8, Sample length:65536 Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144 Level:2, Cond
+        downsample:None, Raw to tokens:128, Sample length:786432 >>> tokenizer =
+        JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics") >>> lyrics = "Hey, are you awake? Can you talk to
+        me?" >>> artist = "Zac Brown Band" >>> genre = "Country" >>> metas = tokenizer(artist=artist, genres=genre,
+        lyrics=lyrics) >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2) >>>
+        model.decode(music_tokens)[:, :10] tensor([[-0.0006], [ 0.0009], [ 0.0005], [-0.0010], [-0.0010], [-0.0004],
+        [-0.0002], [-0.0004], [-0.0005], [ 0.0002]]```""",
     )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 24a8ef108460c..8a2496a53585c 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -18,13 +18,12 @@
 import json
 import os
 from json.encoder import INFINITY
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
 import regex as re
 from tokenizers import normalizers
-from transformers.testing_utils import require_torch
 from transformers.utils.generic import _is_jax, _is_numpy
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
@@ -32,10 +31,6 @@
 from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
 
 
-if TYPE_CHECKING:
-    if is_torch_available():
-        import torch
-
 logger = logging.get_logger(__name__)
 
 VOCAB_FILES_NAMES = {
@@ -68,40 +63,6 @@
 """
 
 
-@require_torch
-def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
-    """
-    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
-    returned. If the provided token sequence is smaller, it will be padded, othewise, only characters ranging from the
-    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
-    the most relevant tokens (in time) for the sequence.
-
-    Args:
-        full_tokens (`List[int]`):
-            List containing the ids of the entire lyrics.
-        total_length (`int`):
-            Total expected length of the music (not all of it is generated, see duration), in samples.
-        offset (`int`):
-            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
-            account
-        duration (`int`):
-            Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
-            which represent the overall length of the signal,
-    """
-    import torch
-
-    full_tokens = full_tokens[0]
-    if len(full_tokens) < max_n_lyric_tokens:
-        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
-        indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
-    else:
-        midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
-        midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
-        tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
-        indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
-    return tokens.unsqueeze(dim=0), indices
-
-
 class JukeboxTokenizer(PreTrainedTokenizer):
     """
     Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
@@ -467,7 +428,9 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         return (artists_file, genres_file, lyrics_file)
 
     def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
-        """Converts an index (integer) in a token (str) using the vocab.
+        """
+        Converts an index (integer) in a token (str) using the vocab.
+
         Args:
             artists_index (`int`):
                 Index of the artist in its corresponding dictionnary.

From 401b970cfc3bb4e35bbe03c233054b3ac5c58dfd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 08:26:19 +0000
Subject: [PATCH 119/196] nits on tokenizer doc

---
 .../models/jukebox/tokenization_jukebox.py    | 34 ++++---------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 8a2496a53585c..08d2ee4654773 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -100,9 +100,12 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     However the code does not allow that and only supports composing from various genres.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file which should contain a dictionnary where the keys are 'artist', 'genre' and
-            'lyrics' and the values are their corresponding vocabulary files.
+        artists_file (`str`):
+            Path to the vocabulary file which contains a mapping between artists and ids. The default file supports both "v2" and "v3" 
+        genres_file (`str`):
+            Path to the vocabulary file which contain a mapping between genres and ids.
+        lyrics_file (`str`):
+            Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
         version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) :
             List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
             `v2`.
@@ -173,20 +176,6 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
         """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
         The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
         the lyrics token sequence.
-
-        Args:
-            artist (`_type_`):
-                _description_
-            genre (`_type_`):
-                _description_
-            lyrics (`_type_`):
-                _description_
-            total_length (`_type_`):
-                _description_
-            offset (`_type_`):
-                _description_
-            duration (`_type_`):
-                _description_
         """
         artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
         for genres in range(len(list_genres)):
@@ -209,14 +198,6 @@ def _tokenize(self, lyrics):
     def tokenize(self, artist, genre, lyrics, **kwargs):
         """
         Converts three strings in a 3 sequence of tokens using the tokenizer
-
-        Args:
-            artist (`_type_`):
-                _description_
-            genre (`_type_`):
-                _description_
-            lyrics (`_type_`):
-                _description_
         """
         artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
         lyrics = self._tokenize(lyrics)
@@ -244,9 +225,6 @@ def prepare_for_tokenization(
                 which it will tokenize. This is useful for NER or token classification.
             kwargs:
                 Keyword arguments to use for the tokenization.
-
-        Returns:
-            `Tuple[str, Union[List[str]|str], str, Dict[str, Any]]`:
         """
         for idx in range(len(self.version)):
             if self.version[idx] == "v3":

From 815790b8178fbe85ebfdc7b8fd0768ac81a8cbc1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 08:36:04 +0000
Subject: [PATCH 120/196] nits doc builder

---
 .../models/jukebox/modeling_jukebox.py        | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 93e7af3567d47..ba9ca9a88db2a 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2932,7 +2932,7 @@ def _sample(
             fp16 (bool, *optional*): _description_. Defaults to False.
 
         Returns:
-            `__type__`: _description_
+            
 
         Example:
         ```python
@@ -3031,21 +3031,29 @@ def _sample(
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.
 
-        Args:""",
+        Args:
+        
+        """,
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
         """
+
         Example:
 
         ```python
-        >>> from transformers import JukeboxTokenizer, JukeboxModel >>> model =
-        JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0) Level:0, Cond downsample:4, Raw to
+        >>> from transformers import JukeboxTokenizer, JukeboxModel 
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0) Level:0, Cond downsample:4, Raw to
         tokens:8, Sample length:65536 Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144 Level:2, Cond
-        downsample:None, Raw to tokens:128, Sample length:786432 >>> tokenizer =
-        JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics") >>> lyrics = "Hey, are you awake? Can you talk to
-        me?" >>> artist = "Zac Brown Band" >>> genre = "Country" >>> metas = tokenizer(artist=artist, genres=genre,
-        lyrics=lyrics) >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2) >>>
-        model.decode(music_tokens)[:, :10] tensor([[-0.0006], [ 0.0009], [ 0.0005], [-0.0010], [-0.0010], [-0.0004],
-        [-0.0002], [-0.0004], [-0.0005], [ 0.0002]]```""",
+        downsample:None, Raw to tokens:128, Sample length:786432 
+        >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+        >>> lyrics = "Hey, are you awake? Can you talk tome?" 
+        >>> artist = "Zac Brown Band" 
+        >>> genre = "Country" 
+        >>> metas = tokenizer(artist=artist, genres=genre,
+        lyrics=lyrics) 
+        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2) 
+        >>>model.decode(music_tokens)[:, :10] 
+        tensor([[-0.0006], [ 0.0009], [ 0.0005], [-0.0010], [-0.0010], [-0.0004],[-0.0002], [-0.0004], [-0.0005], [ 0.0002]]
+        ```""",
     )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
 

From 4e10b0168e761ab1f5e0252637b4872bddc6e2ae Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 08:43:23 +0000
Subject: [PATCH 121/196] fix example

---
 .../models/jukebox/modeling_jukebox.py        | 41 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  3 +-
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ba9ca9a88db2a..60db9f3c31253 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -29,7 +29,7 @@
 
 from ...activations import ACT2FN
 from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, logging
+from ...utils import add_end_docstrings, add_start_docstrings, logging
 from .configuration_jukebox import JukeboxConfig
 
 
@@ -2932,7 +2932,7 @@ def _sample(
             fp16 (bool, *optional*): _description_. Defaults to False.
 
         Returns:
-            
+
 
         Example:
         ```python
@@ -3031,31 +3031,32 @@ def _sample(
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.
 
-        Args:
-        
-        """,
+        Args:""",
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
+    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
         """
-
         Example:
 
         ```python
-        >>> from transformers import JukeboxTokenizer, JukeboxModel 
-        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0) Level:0, Cond downsample:4, Raw to
-        tokens:8, Sample length:65536 Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144 Level:2, Cond
-        downsample:None, Raw to tokens:128, Sample length:786432 
+        >>> from transformers import JukeboxTokenizer, JukeboxModel
+
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0)
+        Level:0, Cond downsample:4, Raw to tokens:8, Sample length:65536
+        Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144
+        Level:2, Cond downsample:None, Raw to tokens:128, Sample length:786432
+
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
-        >>> lyrics = "Hey, are you awake? Can you talk tome?" 
-        >>> artist = "Zac Brown Band" 
-        >>> genre = "Country" 
-        >>> metas = tokenizer(artist=artist, genres=genre,
-        lyrics=lyrics) 
-        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2) 
-        >>>model.decode(music_tokens)[:, :10] 
+        >>> lyrics = "Hey, are you awake? Can you talk to me?"
+        >>> artist = "Zac Brown Band"
+        >>> genre = "Country"
+        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
+
+        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2)
+
+        >>> model.decode(music_tokens)[:, :10]
         tensor([[-0.0006], [ 0.0009], [ 0.0005], [-0.0010], [-0.0010], [-0.0004],[-0.0002], [-0.0004], [-0.0005], [ 0.0002]]
-        ```""",
-    )
-    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
+        ```"""
 
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = [
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 08d2ee4654773..4750482dc214f 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -101,7 +101,8 @@ class JukeboxTokenizer(PreTrainedTokenizer):
 
     Args:
         artists_file (`str`):
-            Path to the vocabulary file which contains a mapping between artists and ids. The default file supports both "v2" and "v3" 
+            Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
+            both "v2" and "v3"
         genres_file (`str`):
             Path to the vocabulary file which contain a mapping between genres and ids.
         lyrics_file (`str`):

From a6372ce2fdef30bf592983fbc07f8860cc76096b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 08:51:28 +0000
Subject: [PATCH 122/196] update modeling doc

---
 .../models/jukebox/modeling_jukebox.py        | 47 ++++++++++++-------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 60db9f3c31253..eeab9d4526258 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2915,23 +2915,36 @@ def _sample(
         the generated raw audio at each step.
 
         Args:
-            music_tokens (`__type__`): _description_
-            labels (`__type__`): _description_
-            sample_levels (`__type__`): _description_
-            metas (`__type__`, *optional*): _description_. Defaults to None.
-            chunk_size (int, *optional*): _description_. Defaults to 32.
-            sampling_temperature (float, *optional*): _description_. Defaults to 0.98.
-            lower_batch_size (int, *optional*): _description_. Defaults to 16.
-            max_batch_size (int, *optional*): _description_. Defaults to 16.
-            sample_length_in_seconds (int, *optional*): _description_. Defaults to 24.
-            alignments (`__type__`, *optional*): _description_. Defaults to None.
-            sample_tokens (`__type__`, *optional*): _description_. Defaults to None.
-            offset (int, *optional*): _description_. Defaults to 0.
-            save_results (bool, *optional*): _description_. Defaults to True.
-            sample_length (`__type__`, *optional*): _description_. Defaults to None.
-            fp16 (bool, *optional*): _description_. Defaults to False.
-
-        Returns:
+            music_tokens (`__type__`): 
+                _description_
+            labels (`__type__`): 
+                _description_
+            sample_levels (`__type__`): 
+                _description_
+            metas (`__type__`, *optional*): 
+                _description_. Defaults to None.
+            chunk_size (int, *optional*):
+                _description_. Defaults to 32.
+            sampling_temperature (float, *optional*): 
+                _description_. Defaults to 0.98.
+            lower_batch_size (int, *optional*):
+                _description_. Defaults to 16.
+            max_batch_size (int, *optional*): 
+                _description_. Defaults to 16.
+            sample_length_in_seconds (int, *optional*): 
+                _description_. Defaults to 24.
+            alignments (`__type__`, *optional*): 
+                _description_. Defaults to None.
+            sample_tokens (`__type__`, *optional*): 
+                _description_. Defaults to None.
+            offset (int, *optional*): 
+                _description_. Defaults to 0.
+            save_results (bool, *optional*): 
+                _description_. Defaults to True.
+            sample_length (`__type__`, *optional*): 
+                _description_. Defaults to None.
+            fp16 (bool, *optional*): 
+                _description_. Defaults to False.
 
 
         Example:

From a17cdf75e23e69bdd591e9fa01fa954622ab97b3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 12:44:12 +0000
Subject: [PATCH 123/196] update doc

---
 docs/source/en/model_doc/jukebox.mdx          |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 84 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  2 +-
 3 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 1a83d1287e68c..25fe3a3e5967d 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -26,7 +26,7 @@ As shown on the following figure, Jukebox is made of 3 `priors` which are decode
 An encoder model is used on the lyrics: the first (also called `top_prior`) prior's decoder attends to the last hidden states extracted from the `lyric_encoder`. Each prior is linked to the previous by an `AudioConditionner` module which takes care of upsampling the generated dequantised states to the correct `raw_to_token` resolution of the current prior.
 The metadatas such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
 
-![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/2a75e9ab330ef27ae6565fcf28b129e033ff9bed/Jukebox.svg)
+![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
 
 Tips:
 - This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior.
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index eeab9d4526258..1fad717528324 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -29,7 +29,7 @@
 
 from ...activations import ACT2FN
 from ...modeling_utils import PreTrainedModel
-from ...utils import add_end_docstrings, add_start_docstrings, logging
+from ...utils import add_start_docstrings, logging
 from .configuration_jukebox import JukeboxConfig
 
 
@@ -1559,7 +1559,6 @@ def __init__(
             self.share_embed_tokens_fc_proj_out = True
 
         if not only_encode:
-            # TODO rename fc_pro_out to LM head an probably use HF's linking trick
             self.fc_proj_out = nn.Linear(width, embed_dim, bias=False)
             if self.share_embed_tokens_fc_proj_out:
                 self.fc_proj_out.weight = self.embed_tokens.weight
@@ -1683,7 +1682,7 @@ def sample(
     ):
         if sample_tokens is None:
             sample_tokens = self.input_dims
-        N, _ = n_samples, self.input_dims
+        N = n_samples
 
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(
@@ -1757,7 +1756,7 @@ def primed_sample(
         sampled_audio = torch.split(hidden_states, 1, dim=1)
         sampled_audio = list(sampled_audio)
 
-        N, _ = n_samples, self.input_dims
+        N = n_samples
 
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(hidden_states.device)
@@ -2344,7 +2343,7 @@ def prior_preprocess(self, tokens, conds):
             tokens[i] = (tokens[i] + int(self.prior_embed_dim_shift[i])).view(N, -1)
 
         for i in range(len(conds)):
-            cond, _, dims = conds[i], self.prior_shapes[i], self.prior_dims[i]
+            cond, dims = conds[i], self.prior_dims[i]
             if cond is None:
                 conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=tokens[0].device)
 
@@ -2903,7 +2902,7 @@ def _sample(
         lower_batch_size=16,
         max_batch_size=16,
         sample_length_in_seconds=24,
-        alignments=None,
+        compute_alignments=False,
         sample_tokens=None,
         offset=0,
         save_results=True,
@@ -2915,36 +2914,42 @@ def _sample(
         the generated raw audio at each step.
 
         Args:
-            music_tokens (`__type__`): 
-                _description_
-            labels (`__type__`): 
-                _description_
-            sample_levels (`__type__`): 
-                _description_
-            metas (`__type__`, *optional*): 
-                _description_. Defaults to None.
-            chunk_size (int, *optional*):
-                _description_. Defaults to 32.
-            sampling_temperature (float, *optional*): 
-                _description_. Defaults to 0.98.
-            lower_batch_size (int, *optional*):
-                _description_. Defaults to 16.
-            max_batch_size (int, *optional*): 
-                _description_. Defaults to 16.
-            sample_length_in_seconds (int, *optional*): 
-                _description_. Defaults to 24.
-            alignments (`__type__`, *optional*): 
-                _description_. Defaults to None.
-            sample_tokens (`__type__`, *optional*): 
-                _description_. Defaults to None.
-            offset (int, *optional*): 
-                _description_. Defaults to 0.
-            save_results (bool, *optional*): 
-                _description_. Defaults to True.
-            sample_length (`__type__`, *optional*): 
-                _description_. Defaults to None.
-            fp16 (bool, *optional*): 
-                _description_. Defaults to False.
+           music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
+                A sequence of music tokens which will be used as context to continue the sampling process. Should have
+                `self.levels` tensors, each corresponding to the generation at a certain level.
+            labels (`List[torch.Tensor]`):
+                Raw list of tokens. Should be the same length as `self.levels`, the number of priors or the length of
+                `sample_levels`.
+            sample_levels (`List[int]`):
+                List of the desired levels at which the sampling will be done. A level is equivalent to the index of
+                the prior in the list of priors
+            metas (`List[Any]`, *optional*, defaults to None):
+                Metadatas used to generate the `labels`
+            chunk_size (`int`, *optional*, defaults to 32):
+                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks means faster memory filling but more consumption.
+            sampling_temperature (`float`, *optional*, defaults to 0.98):
+                Temperature used to ajust the randomness of the sampling.
+            lower_batch_size (`int`, *optional*, defaults to 16):
+                Maximum batch size for the lower level priors
+            max_batch_size (`int`, *optional*, defaults to 16):
+                Maximum batch size for the top level priors
+            sample_length_in_seconds (`int`, *optional*, defaults to 24):
+                Desired lenght of the generation in seconds
+            compute_alignments (`bool`, *optional*, defaults to False):
+                Whether or not to compute the alignment between the lyrics and the audio using the top_prior
+            sample_tokens (`int`, *optional*, defaults to None):
+                Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
+                experiments
+            offset (`int`, *optional*, defaults to 0):
+                Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
+                greater than 0, the lyrics will be shifted take that intoaccount
+            save_results (`bool`, *optional*, defaults to True):
+                Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
+                time.
+            sample_length (`int`, *optional*, defaults to None):
+                Desired lenght of the generation in samples.
+            fp16 (`bool`, *optional*, defaults to False):
+                Whether or not to cast the hidden states to float16 in the attention layer. Defaults to False.
 
 
         Example:
@@ -2957,9 +2962,8 @@ def _sample(
 
         >>> zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         >>> zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
-        [1864,
-        1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534, 1000, 1445, 1105, 1130, 967, 515, 1434, 1620,
-        534, 1495, 283, 1445, 333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883, 1405, 1276, 1455, 1228]
+        [1864,1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534, 1000, 1445, 1105, 1130, 967, 515, 1434, 1620,
+         534, 1495, 283, 1445, 333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883, 1405, 1276, 1455, 1228]
         ```"""
 
         top_prior = self.priors[-1]
@@ -3029,7 +3033,7 @@ def _sample(
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
                 save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
-                if alignments is None and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
+                if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     gc.collect()
                     torch.cuda.empty_cache()
                     with torch.no_grad():
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 4750482dc214f..74f7e5ba019f0 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -71,7 +71,7 @@ class JukeboxTokenizer(PreTrainedTokenizer):
         - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
         vocabulary.
 
-    This tokenizer is straight forward and does not require trainingg. It should be able to process a different number of inputs:
+    This tokenizer does not require training. It should be able to process a different number of inputs:
     as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.:
 
     Depending on the number of genres on which the model should be conditioned (`n_genres`).

From a0d51f1487ed00b9c2674a906dc1925a0ded99c1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 13:24:04 +0000
Subject: [PATCH 124/196] Nit

---
 src/transformers/models/jukebox/modeling_jukebox.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 1fad717528324..268acaa2bca8f 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -35,10 +35,6 @@
 
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "ArthurZ/jukebox-dummy"
-_CONFIG_FOR_DOC = "JukeboxConfig"
-_TOKENIZER_FOR_DOC = "JukeboxTokenizer"
-
 JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "openai/jukebox-1b-lyrics",
     "openai/jukebox-5b-lyrics",
@@ -2926,7 +2922,8 @@ def _sample(
             metas (`List[Any]`, *optional*, defaults to None):
                 Metadatas used to generate the `labels`
             chunk_size (`int`, *optional*, defaults to 32):
-                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks means faster memory filling but more consumption.
+                Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks
+                means faster memory filling but more consumption.
             sampling_temperature (`float`, *optional*, defaults to 0.98):
                 Temperature used to ajust the randomness of the sampling.
             lower_batch_size (`int`, *optional*, defaults to 16):

From 677b27f39bd408914cae9271246166ab9df10902 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 20 Sep 2022 13:51:47 +0000
Subject: [PATCH 125/196] remove todos

---
 src/transformers/models/jukebox/modeling_jukebox.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 268acaa2bca8f..6de601551aa4b 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1727,7 +1727,6 @@ def split_chunks(self, length, chunk_size):
         chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
         return chunk_sizes
 
-    # FIXME TODO last function needing renaming
     def primed_sample(
         self,
         n_samples,
@@ -1773,7 +1772,6 @@ def primed_sample(
             for current_chunk_size in get_range(chunk_sizes):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
-                    # TODO rename x_prime, con_prime
                     x_prime, cond_prime = self.get_emb(
                         sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
                     )
@@ -1781,7 +1779,6 @@ def primed_sample(
                     sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
                 start = start + current_chunk_size
-                # TODO rename x_prime, con_prime
                 x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1)
                 del sampled_audio_prime
                 del conds_prime

From ebad2d18b63581a75fb2ddb6d91857334b211416 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 21 Sep 2022 15:34:40 +0200
Subject: [PATCH 126/196] Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 docs/source/en/model_doc/jukebox.mdx                    | 6 +++---
 src/transformers/models/jukebox/tokenization_jukebox.py | 2 +-
 tests/models/jukebox/test_modeling_jukebox.py           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 25fe3a3e5967d..df4941c8eb13f 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -23,13 +23,13 @@ The abstract from the paper is the following:
 *We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
 
 As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only model. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. 
-An encoder model is used on the lyrics: the first (also called `top_prior`) prior's decoder attends to the last hidden states extracted from the `lyric_encoder`. Each prior is linked to the previous by an `AudioConditionner` module which takes care of upsampling the generated dequantised states to the correct `raw_to_token` resolution of the current prior.
-The metadatas such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
+First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditionner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. 
+The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
 
 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
 
 Tips:
-- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior.
+- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU.
 - Primed sampling requires more memory than ancestral sampling and should be used with `fp16` set to `True`. 
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 74f7e5ba019f0..f2f7b4b6df919 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -52,7 +52,7 @@
 }
 
 PRETRAINED_LYRIC_TOKENS_SIZES = {
-    "jukebox": 512,  # corresonds to the dummy-model ?
+    "jukebox": 512, 
 }
 
 """" batch_outputs = BatchEncoding(
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index f857daf0b70b2..d1df86a1f92fc 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright 2022 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@
 if is_torch_available():
     import torch
 
-    from transformers import JukeboxModel, JukeboxTokenizer  # ,JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST
+    from transformers import JukeboxModel, JukeboxTokenizer
 
 
 @require_torch

From d2f6eedb217d28130e3edf5310d242c7b2008660 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 21 Sep 2022 15:36:06 +0200
Subject: [PATCH 127/196] Update docs/source/en/model_doc/jukebox.mdx

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 docs/source/en/model_doc/jukebox.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index df4941c8eb13f..1025ffda81e05 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -22,7 +22,7 @@ The abstract from the paper is the following:
 
 *We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
 
-As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only model. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. 
+As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. 
 First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditionner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. 
 The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
 

From 23feb31e9c1090c31b9036ed07791c8580dd70c2 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 21 Sep 2022 15:49:56 +0200
Subject: [PATCH 128/196] Update src/transformers/models/jukebox/__init__.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/transformers/models/jukebox/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index c243a7f3b67be..f4fd73cb9c048 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -2,7 +2,7 @@
 # There's no way to ignore "F401 '...' imported but unused" warnings in this
 # module, but to preserve other warnings. So, don't check this module at all.
 
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright 2022 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From ea2c6f3534bcb263bb71bbccb439d2c0d101605f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 21 Sep 2022 15:45:31 +0000
Subject: [PATCH 129/196] update based on review

---
 docs/source/en/model_doc/jukebox.mdx          |  2 +-
 .../models/jukebox/tokenization_jukebox.py    | 25 +++++++++++--------
 tests/models/jukebox/test_modeling_jukebox.py |  5 ----
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 1025ffda81e05..556588eda19ec 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -30,7 +30,7 @@ The metadata such as *artist, genre and timing* are passed to each prior, in the
 
 Tips:
 - This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU.
-- Primed sampling requires more memory than ancestral sampling and should be used with `fp16` set to `True`. 
+- Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`. 
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
 The original code can be found [here](https://github.com/openai/jukebox).
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index f2f7b4b6df919..0e33272179469 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -17,13 +17,13 @@
 
 import json
 import os
+import unicodedata
 from json.encoder import INFINITY
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
 import regex as re
-from tokenizers import normalizers
 from transformers.utils.generic import _is_jax, _is_numpy
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
@@ -52,16 +52,9 @@
 }
 
 PRETRAINED_LYRIC_TOKENS_SIZES = {
-    "jukebox": 512, 
+    "jukebox": 512,
 }
 
-"""" batch_outputs = BatchEncoding(
-    encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-)
-
-
-"""
-
 
 class JukeboxTokenizer(PreTrainedTokenizer):
     """
@@ -249,12 +242,22 @@ def prepare_for_tokenization(
         else:
             self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
 
-        normalizer = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
-        lyrics = normalizer.normalize_str(lyrics)
+        lyrics = self._run_strip_accents(lyrics)
         lyrics = lyrics.replace("\\", "\n")
         lyrics = [], [], self.out_of_vocab.sub("", lyrics)
         return artists, genres, lyrics
 
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
     def _normalize(self, text: str) -> str:
         """Normalizes the input text. This process is for the genres and the artit
 
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d1df86a1f92fc..af61a6307064e 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -333,8 +333,3 @@ def test_slow_sampling(self):
         set_seed(0)
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
-
-if __name__ == "__main__":
-    tester = Jukebox1bModelTester()
-    tester.test_vqvae()

From 36c3704cfeb096ddf922ffda06e9c386db9a7e86 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 21 Sep 2022 15:47:47 +0000
Subject: [PATCH 130/196] partial update

---
 src/transformers/models/jukebox/modeling_jukebox.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6de601551aa4b..7712fc72fc7dc 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -57,7 +57,7 @@ def get_range(list):
 
     print("Using apex FusedLayerNorm")
 except ImportError:
-    from torch.nn import LayerNorm as FusedLayerNorm
+from torch.nn import LayerNorm as FusedLayerNorm
 
 
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
@@ -156,11 +156,12 @@ def _get_depth(depth):
 
         blocks = []
         for depth in range(n_depth):
+            block_depth = depth if dilation_cycle is None else depth % dilation_cycle
             blocks.append(
                 JukeboxResConv1DBlock(
                     n_in,
                     int(m_conv * n_in),
-                    dilation=dilation_growth_rate ** _get_depth(depth),
+                    dilation=dilation_growth_rate**block_depth,
                     zero_out=zero_out,
                     res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth),
                 )
@@ -193,7 +194,8 @@ def __init__(
     ):
         super().__init__()
         blocks = []
-        filter_t, pad_t = stride_t * 2, stride_t // 2
+        filter_t = stride_t * 2
+        pad_t = stride_t // 2
         if down_t > 0:
             for i in range(down_t):
                 blocks.append(nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t))
@@ -591,7 +593,8 @@ def __init__(self, config):
         self.commit = config.vqvae_commit
 
         self.sample_length = input_shape[0]
-        x_shape, x_channels = input_shape[:-1], input_shape[-1]
+        x_shape = input_shape[:-1]
+        x_channels = input_shape[-1]
         self.x_shape = x_shape
 
         self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]

From 65c067f26319fb5bba618cce95da621930ee2736 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 21 Sep 2022 16:49:42 +0000
Subject: [PATCH 131/196] update

---
 src/transformers/models/jukebox/modeling_jukebox.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 7712fc72fc7dc..ed696ba5ee0a1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -51,12 +51,6 @@ def get_range(list):
     )
 
 
-# Import FusedLayerNorm if we have apex, otherwise use regular LayerNorm
-try:
-    from apex.normalization import FusedLayerNorm
-
-    print("Using apex FusedLayerNorm")
-except ImportError:
 from torch.nn import LayerNorm as FusedLayerNorm
 
 
@@ -148,12 +142,6 @@ def __init__(
     ):
         super().__init__()
 
-        def _get_depth(depth):
-            if dilation_cycle is None:
-                return depth
-            else:
-                return depth % dilation_cycle
-
         blocks = []
         for depth in range(n_depth):
             block_depth = depth if dilation_cycle is None else depth % dilation_cycle

From d16da730b66f8bde1b12bc33dbaa4f7c04d56ca0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 14:43:45 +0000
Subject: [PATCH 132/196] comment out empyt_cache

---
 .../models/jukebox/modeling_jukebox.py        | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ed696ba5ee0a1..e2ee16c2a0944 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -25,6 +25,7 @@
 import torch
 import torch.nn.functional as F
 from torch import nn
+from torch.nn import LayerNorm as FusedLayerNorm
 from tqdm import tqdm
 
 from ...activations import ACT2FN
@@ -51,9 +52,6 @@ def get_range(list):
     )
 
 
-from torch.nn import LayerNorm as FusedLayerNorm
-
-
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
     """
     Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
@@ -1790,8 +1788,8 @@ def primed_sample(
                 x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
-            gc.collect()
-            torch.cuda.empty_cache()
+            # gc.collect()
+            # torch.cuda.empty_cache()
             hidden_states = sampled_audio[-1]
 
             for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
@@ -2644,8 +2642,8 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
     indices_hops = {}
     # prior.to(tokens.device)
     prior.to("cuda")
-    gc.collect()
-    torch.cuda.empty_cache()
+    # gc.collect()
+    # torch.cuda.empty_cache()
     for start in get_range(get_starts(total_length, n_ctx, hop_length)):
         end = start + n_ctx
         # set metadata offset, sample_length and lyrics tokens
@@ -2672,8 +2670,8 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
         indices_hops[start] = indices_hop
         alignment_hops[start] = alignment_hop
     prior.cpu()
-    gc.collect()
-    torch.cuda.empty_cache()
+    # gc.collect()
+    # torch.cuda.empty_cache()
 
     # Combine attn for each hop into attn for full range
     # Use indices to place them into correct place for corresponding source tokens
@@ -2832,8 +2830,8 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
-        gc.collect()
-        torch.cuda.empty_cache()
+        # gc.collect()
+        # torch.cuda.empty_cache()
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
@@ -2991,8 +2989,8 @@ def _sample(
 
             # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
-            gc.collect()
-            torch.cuda.empty_cache()
+            # gc.collect()
+            # torch.cuda.empty_cache()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             # self.priors[level].total_length = total_length // self.priors[level].raw_to_tokens
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
@@ -3003,8 +3001,8 @@ def _sample(
             )
 
             self.priors[level].to("cpu")
-            gc.collect()
-            torch.cuda.empty_cache()
+            # gc.collect()
+            # torch.cuda.empty_cache()
             self.vqvae.to(music_tokens[level].device)
             # Decode sample
             with torch.no_grad():
@@ -3019,8 +3017,8 @@ def _sample(
                     os.makedirs(logdir)
                 save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
                 if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
-                    gc.collect()
-                    torch.cuda.empty_cache()
+                    # gc.collect()
+                    # torch.cuda.empty_cache()
                     with torch.no_grad():
                         alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], fp16, self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")

From 3e745eebb758b8af65f2fdb9e142685cd438b462 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 18 Oct 2022 16:30:00 +0000
Subject: [PATCH 133/196] update based on review

---
 .../models/jukebox/modeling_jukebox.py        | 399 +++++++-----------
 1 file changed, 159 insertions(+), 240 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e2ee16c2a0944..831114ba1e28b 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -26,7 +26,7 @@
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import LayerNorm as FusedLayerNorm
-from tqdm import tqdm
+from ...utils.logging import tqdm
 
 from ...activations import ACT2FN
 from ...modeling_utils import PreTrainedModel
@@ -43,13 +43,39 @@
 ]
 
 
-def get_range(list):
-    return tqdm(
-        list,
-        leave=True,
-        file=sys.stdout,
-        bar_format="{n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
-    )
+
+
+def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
+    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+    Args:
+        logits: logits distribution shape (vocabulary size)
+        top_k >0: keep only top key tokens with highest probability (top-k filtering).
+        top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+    """
+    logits = logits.clone()
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    assert (top_k == 0) or (top_p == 0.0)
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        # indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        indices_to_remove = torch.zeros_like(logits, dtype=torch.uint8).scatter_(
+            dim=-1, index=sorted_indices, src=sorted_indices_to_remove
+        )
+        logits[indices_to_remove] = filter_value
+    return logits
 
 
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
@@ -83,6 +109,31 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     return tokens.unsqueeze(dim=0), indices
 
 
+def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
+    # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
+    if mask is None or query_length == 1:
+        return None
+    offset = sample_t - query_length if sample else max(key_value_length - query_length, 0)
+    if mask == "autoregressive":
+        # Masked dense
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+    elif mask == "summary":
+        # Masked summary
+        mask = (
+            torch.nn.functional.pad(
+                torch.ones(query_length, query_length, device=device)
+                .tril()
+                .view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
+                (0, 0, 1, 0),
+                value=1,
+            )
+            .contiguous()
+            .view(query_length, key_value_length)
+        )
+    elif mask == "prime":
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+    return mask.view(1, 1, query_length, key_value_length)
+
 class JukeboxConv1D(nn.Module):
     def __init__(self, n_in, n_out, zero_out=False):
         super(JukeboxConv1D, self).__init__()
@@ -327,10 +378,7 @@ def __init__(self, codebook_dim, codebook_width, mu):
         self.codebook_dim = codebook_dim
         self.codebook_width = codebook_width
         self.mu = mu
-        self.reset_codebook()
         self.threshold = 1.0
-
-    def reset_codebook(self):
         self.init = False
         self.codebook_sum = None
         self.codebook_elem = None
@@ -360,7 +408,7 @@ def update_codebook(self, hidden_states, latent_states):
             # Calculate new centres
             latent_states_onehot = torch.zeros(
                 codebook_dim, hidden_states.shape[0], device=hidden_states.device
-            )  # codebook_dim, N * L
+            )  # codebook_dim, batch_size * L
             latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
 
             _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # codebook_dim, w
@@ -390,7 +438,7 @@ def update_codebook(self, hidden_states, latent_states):
     def preprocess(self, hidden_states):
         # NCT -> NTC -> [NT, C]
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (N *L, w), k_j = (w, codebook_dim)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (batch_size *L, w), k_j = (w, codebook_dim)
 
         if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
@@ -407,9 +455,9 @@ def preprocess(self, hidden_states):
 
     def postprocess(self, latent_states, dequantised_states, x_shape):
         # [NT, C] -> NTC -> NCT
-        N, T = x_shape
-        dequantised_states = dequantised_states.view(N, T, -1).permute(0, 2, 1).contiguous()
-        latent_states = latent_states.view(N, T)
+        batch_size, T = x_shape
+        dequantised_states = dequantised_states.view(batch_size, T, -1).permute(0, 2, 1).contiguous()
+        latent_states = latent_states.view(batch_size, T)
         return latent_states, dequantised_states
 
     def quantise(self, latent_states):
@@ -419,7 +467,7 @@ def quantise(self, latent_states):
             torch.sum(latent_states**2, dim=-1, keepdim=True)
             - 2 * torch.matmul(latent_states, codebook_weights)
             + torch.sum(codebook_weights**2, dim=0, keepdim=True)
-        )  # (N *L, b)
+        )  # (batch_size *L, b)
         min_distance, music_tokens = torch.min(distance, dim=-1)
         fit = torch.mean(min_distance)
         return music_tokens, fit
@@ -627,16 +675,6 @@ def decoder(level):
 
         self.bottleneck = JukeboxBottleneck(codebook_dim, codebook_width, config.vqvae_lmu, levels)
 
-    def preprocess(self, raw_audio):
-        # x: NTC [-1,1] -> NCT [-1,1]
-        raw_audio = raw_audio.permute(0, 2, 1).float()
-        return raw_audio
-
-    def postprocess(self, dequantised_states):
-        # x: NTC [-1,1] <- NCT [-1,1]
-        dequantised_states = dequantised_states.permute(0, 2, 1)
-        return dequantised_states
-
     def _decode(self, music_tokens, start_level=0, end_level=None):
         # Decode
         if end_level is None:
@@ -645,7 +683,7 @@ def _decode(self, music_tokens, start_level=0, end_level=None):
         # Use only lowest level
         decoder, dequantised_state = self.decoders[start_level], latent_states[0:1]
         dequantised_state = decoder(dequantised_state, all_levels=False)
-        dequantised_state = self.postprocess(dequantised_state)
+        dequantised_state = dequantised_state.permute(0, 2, 1)
         return dequantised_state
 
     def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> torch.Tensor:
@@ -675,7 +713,7 @@ def _encode(self, raw_audio, start_level=0, end_level=None):
         # Encode
         if end_level is None:
             end_level = self.levels
-        input_audio = self.preprocess(raw_audio)
+        input_audio = raw_audio.permute(0, 2, 1).float()
         latent_states = []
         for level in range(self.levels):
             encoder = self.encoders[level]
@@ -732,14 +770,15 @@ def forward(self, raw_audio):
 
         Example:
         ```python
-        >>> model = JukeboxVQVAE.from_pretrained(self.model_id).eval()
-
+        >>> from transformers import JukeboxVQVAE, set_seed
+        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
+        >>> set_seed(0)
         >>> zs = [torch.random(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        >>> zs = model(zs)
+        >>> audio = model.decode(zs)
         ```"""
 
         # Encode/Decode
-        input_audio = self.preprocess(raw_audio)
+        input_audio = raw_audio.permute(0, 2, 1).float()
         latent_states = []
         for level in range(self.levels):
             encoder = self.encoders[level]
@@ -751,15 +790,12 @@ def forward(self, raw_audio):
         for level in range(self.levels):
             decoder = self.decoders[level]
             dequantised_state = decoder(music_tokens[level : level + 1], all_levels=False)
-            dequantised_state.append(dequantised_state)
-
-        for level in reversed(range(self.levels)):
-            dequantised_state = self.postprocess(dequantised_states[level])
+            dequantised_states.append(dequantised_state.permute(0, 2, 1))
 
         commit_loss = sum(commit_losses)
         loss = self.commit * commit_loss
 
-        return dequantised_state, loss
+        return dequantised_states, loss
 
 
 class JukeboxMLP(nn.Module):
@@ -792,30 +828,6 @@ def forward(self, input):
             return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
-def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
-    # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
-    if mask is None or query_length == 1:
-        return None
-    offset = sample_t - query_length if sample else max(key_value_length - query_length, 0)
-    if mask == "autoregressive":
-        # Masked dense
-        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
-    elif mask == "summary":
-        # Masked summary
-        mask = (
-            torch.nn.functional.pad(
-                torch.ones(query_length, query_length, device=device)
-                .tril()
-                .view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
-                (0, 0, 1, 0),
-                value=1,
-            )
-            .contiguous()
-            .view(query_length, key_value_length)
-        )
-    elif mask == "prime":
-        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
-    return mask.view(1, 1, query_length, key_value_length)
 
 
 class JukeboxAttention(nn.Module):
@@ -856,11 +868,11 @@ def __init__(
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
         self.qkv, self.attn, self.attn_mask = {
-            0: (self.factored_qkv, self.dense_attn, "autoregressive"),  # Attend to all positions
-            1: (self.factored_qkv, self.block_attn, "autoregressive"),  # Attend to your block
-            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
-            3: (self.factored_qkv, self.prev_block_attn, None),  # Attend to previous block
-            4: (self.factored_qkv, self.summary_attn, "summary"),  # Attend to last position of each block
+            0: (self.factored_qkv, self.dense_attn, "autoregressive"), # Attend to all positions
+            1: (self.factored_qkv, self.block_attn, "autoregressive"), # Attend to your block
+            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"), # Attend to transpose block
+            3: (self.factored_qkv, self.prev_block_attn, None), # Attend to previous block
+            4: (self.factored_qkv, self.summary_attn, "summary"), # Attend to last position of each block
             5: (self.factored_qkv, self.summary_spread_attn, "summary"),
             6: (self.decode_qkv, self.decode_attn, None),
             7: (self.prime_qkv, self.prime_attn, "prime"),
@@ -1349,23 +1361,23 @@ def __init__(
 
         # Orders of attn_func
         attn_func = {
-            0: lambda d: 0,  # Complete dense attn
-            1: lambda d: [1, 2][d % 2],  # Alternate row and column attn
-            2: lambda d: [1, 2, 3][d % 3],  # Alternate row, column and previous row attn
-            3: lambda d: [1, 4][d % 2],  # Alternate row and last column
-            4: lambda d: [1, 5][d % 2],  # Alternate row and last k columns
-            5: lambda d: [1, 4, 1, 1][d % 4],  # Alternate row, last column, row, row
+            0: lambda d: 0, # Complete dense attn
+            1: lambda d: [1, 2][d % 2], # Alternate row and column attn
+            2: lambda d: [1, 2, 3][d % 3], # Alternate row, column and previous row attn
+            3: lambda d: [1, 4][d % 2], # Alternate row and last column
+            4: lambda d: [1, 5][d % 2], # Alternate row and last k columns
+            5: lambda d: [1, 4, 1, 1][d % 4], # Alternate row, last column, row, row
             6: lambda d: [1, 2, 3, 6][d % 4],
             7: lambda d: [*[1, 2, 3] * 5, 6][d % 16],
-            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10],  # Used by separated_enc_dec model with lyrics
+            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10], # Used by separated_enc_dec model with lyrics
             9: lambda d: [1, 2, 3, 0][d % 4],
             10: lambda d: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
                 d % 79
-            ],  # Used by large separated_enc_dec model with lyrics
+            ], # Used by large separated_enc_dec model with lyrics
             11: lambda d: [6, 6, 0][d % 3] if d % 16 == 15 else [1, 2, 3][d % 3],
             12: lambda d: [7, 7, 0][d % 3]
             if d % 16 == 15
-            else [1, 2, 3][d % 3],  # Used by single_enc_dec model with lyrics
+            else [1, 2, 3][d % 3], # Used by single_enc_dec model with lyrics
         }[attn_order]
 
         def attn_block(d):
@@ -1446,7 +1458,8 @@ def __init__(self, input_shape, width, init_scale=1.0):
         super().__init__()
         self.input_shape = input_shape
         self.input_dims = np.prod(input_shape)
-        self.pos_emb = nn.Parameter(get_normal(self.input_dims, width, std=0.01 * init_scale))
+        self.pos_emb = nn.Parameter(torch.empty((self.input_dims, width)))
+        nn.init.normal_(self.pos_emb, std=0.01 * init_scale)
 
     def forward(self):
         pos_emb = self.pos_emb
@@ -1507,8 +1520,8 @@ def __init__(
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
-            self.start_token = nn.Parameter(get_normal(1, width, std=0.01 * init_scale))
-
+            self.start_token = nn.Parameter(torch.empty((1, width)))
+            nn.init.normal_(self.start_token, std=0.01 * init_scale)
         self.pos_emb = JukeboxPositionalEmbedding(input_shape=input_shape, width=width, init_scale=init_scale)
         self.pos_emb_dropout = nn.Dropout(emb_dropout)
 
@@ -1552,17 +1565,16 @@ def __init__(
     def preprocess(self, tokens):
         # Input: hidden_states is NHWC and uint8. Converted to NL and long
         # Can include stuff like bitpacking, reordering here.
-        N = tokens.shape[0]
-        return tokens.view(N, -1).long()
+        batch_size = tokens.shape[0]
+        return tokens.view(batch_size, -1).long()
 
     def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
-        N = tokens.shape[0]
-        assert (0 <= tokens).all() and (tokens < self.embed_dim).all()
+        batch_size = tokens.shape[0]
         if sample_tokens is None or sample_tokens == self.input_dims:
-            return tokens.view(N, *self.input_shape)
+            return tokens.view(batch_size, *self.input_shape)
         else:
-            return tokens.view(N, -1)
+            return tokens.view(batch_size, -1)
 
     def forward(
         self,
@@ -1582,9 +1594,9 @@ def forward(
         with torch.no_grad():
             tokens = self.preprocess(tokens)
 
-        N = tokens.shape[0]
+        batch_size = tokens.shape[0]
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((N, 1, self.width), device=tokens.device, dtype=torch.float)
+            audio_conditioning = torch.zeros((batch_size, 1, self.width), device=tokens.device, dtype=torch.float)
 
         target = tokens  # Target
         hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
@@ -1592,7 +1604,7 @@ def forward(
             (hidden_states[:, -1:], hidden_states[:, :-1]), dim=1
         )  # Shift by 1, and fill in start token
         if self.metadata_conditioning:
-            hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
+            hidden_states[:, 0] = metadata_conditioning.view(batch_size, self.width)
         else:
             hidden_states[:, 0] = self.start_token
 
@@ -1634,16 +1646,15 @@ def forward(
             return loss, None
 
     def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
-        N, D = n_samples, self.input_dims
         if sample_t == 0:
             hidden_states = torch.empty(n_samples, 1, self.width).to(audio_conditioning.device)
             if self.metadata_conditioning:
-                hidden_states[:, 0] = metadata_conditioning.view(N, self.width)
+                hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width)
             else:
                 hidden_states[:, 0] = self.start_token
         else:
             hidden_states = self.embed_tokens(tokens)
-        if audio_conditioning.shape == (N, D, self.width):
+        if audio_conditioning.shape == (n_samples, self.input_dims, self.width):
             cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
             cond = audio_conditioning
@@ -1667,10 +1678,9 @@ def sample(
     ):
         if sample_tokens is None:
             sample_tokens = self.input_dims
-        N = n_samples
 
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(
+            audio_conditioning = torch.zeros((n_samples, 1, self.width), dtype=torch.float).to(
                 "cpu" if torch.cuda.is_available() else "cpu"
             )
 
@@ -1678,8 +1688,9 @@ def sample(
             sampled_tokens, tokens = [], None
             if get_preds:
                 preds = []
-
-            for sample_t in get_range(range(0, sample_tokens)):
+                
+            iter =  tqdm(range(0, sample_tokens), desc = f"Sampling tokens :")
+            for sample_t in iter:
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
                 )
@@ -1740,10 +1751,8 @@ def primed_sample(
         sampled_audio = torch.split(hidden_states, 1, dim=1)
         sampled_audio = list(sampled_audio)
 
-        N = n_samples
-
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((N, 1, self.width), dtype=torch.float).to(hidden_states.device)
+            audio_conditioning = torch.zeros((n_samples, 1, self.width), dtype=torch.float).to(hidden_states.device)
 
         with torch.no_grad():
             if get_preds:
@@ -1758,7 +1767,7 @@ def primed_sample(
             start = 0
             hidden_states = None
 
-            for current_chunk_size in get_range(chunk_sizes):
+            for current_chunk_size in tqdm(chunk_sizes, desc="Filling past key value"):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     x_prime, cond_prime = self.get_emb(
@@ -1788,11 +1797,8 @@ def primed_sample(
                 x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
-            # gc.collect()
-            # torch.cuda.empty_cache()
             hidden_states = sampled_audio[-1]
-
-            for sample_t in get_range(range(len(sampled_audio), sample_tokens)):
+            for sample_t in tqdm(range(len(sampled_audio), sample_tokens), desc = f"Sampling {len(sampled_audio)} tokens "):
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
                 )
@@ -1802,8 +1808,6 @@ def primed_sample(
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
-                if fp16:
-                    hidden_states = hidden_states.half()
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
                 if get_preds:
                     preds.append(hidden_states)
@@ -1813,7 +1817,6 @@ def primed_sample(
                 hidden_states = torch.distributions.Categorical(
                     logits=hidden_states
                 ).sample()  # Sample and replace hidden_states
-                assert hidden_states.shape == (n_samples, 1)
                 sampled_audio.append(hidden_states.clone())
 
             del hidden_states
@@ -1829,44 +1832,8 @@ def primed_sample(
             return hidden_states
 
 
-def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
-    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-    Args:
-        logits: logits distribution shape (vocabulary size)
-        top_k >0: keep only top key tokens with highest probability (top-k filtering).
-        top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
-    """
-    logits = logits.clone()
-    top_k = min(top_k, logits.size(-1))  # Safety check
-    assert (top_k == 0) or (top_p == 0.0)
-    if top_k > 0:
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
-        logits[indices_to_remove] = filter_value
-
-    if top_p > 0.0:
-        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-        # Remove tokens with cumulative probability above the threshold
-        sorted_indices_to_remove = cumulative_probs > top_p
-        # Shift the indices to the right to keep also the first token above the threshold
-        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-        sorted_indices_to_remove[..., 0] = 0
-
-        # indices_to_remove = sorted_indices[sorted_indices_to_remove]
-        indices_to_remove = torch.zeros_like(logits, dtype=torch.uint8).scatter_(
-            dim=-1, index=sorted_indices, src=sorted_indices_to_remove
-        )
-        logits[indices_to_remove] = filter_value
-    return logits
 
 
-def get_normal(*shape, std=0.01):
-    w = torch.empty(shape)
-    nn.init.normal_(w, std=std)
-    return w
-
 
 class JukeboxMusicTokenConditioner(nn.Module):
     """
@@ -1891,14 +1858,6 @@ def __init__(
         )
         self.layer_norm = JukeboxLayerNorm(self.width)
 
-    def preprocess(self, hidden_states):
-        hidden_states = hidden_states.permute(0, 2, 1)  # NTC -> NCT
-        return hidden_states
-
-    def postprocess(self, hidden_states):
-        hidden_states = hidden_states.permute(0, 2, 1)  # NCT -> NTC
-        return hidden_states
-
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         Args :
@@ -1916,9 +1875,9 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         hidden_states = hidden_states + raw_audio_conditionning
 
         # Run conditioner
-        hidden_states = self.preprocess(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
         hidden_states = self.upsampler(hidden_states)
-        hidden_states = self.postprocess(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
         hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
@@ -2004,16 +1963,15 @@ def __init__(
         self.out_width = out_width
         nb_genres, nb_artists = metadata_dims
         self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)
+        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width) #TODO check if that does not break anything
         self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
+        # self.bow_genre_emb = nn.Embedding(nb_genres, out_width) #TODO maybe test that
+        # self.artist_emb = nn.Embedding(nb_artists, out_width)
         self.include_time_signal = include_time_signal
         if self.include_time_signal:
-            t_ranges = (
-                (min_duration * sampling_rate, max_duration * sampling_rate),  # Total length
-                (0.0, max_duration * sampling_rate),  # Absolute pos
-                (0.0, 1.0),
-            )  # Relative pos
-            total_length_range, absolute_pos_range, relative_pos_range = t_ranges
+            total_length_range = (min_duration * sampling_rate, max_duration * sampling_rate)
+            absolute_pos_range = (0.0, max_duration * sampling_rate)
+            relative_pos_range = (0.0, 1.0)
             self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
             self.absolute_pos_emb = JukeboxRangeEmbedding(
                 n_time, timing_dims, absolute_pos_range, out_width, init_scale
@@ -2023,20 +1981,19 @@ def __init__(
             )
 
     def forward(self, metadata):
-        total_length, offset, length, artist, genre = (
-            metadata[:, 0:1],
-            metadata[:, 1:2],
-            metadata[:, 2:3],
-            metadata[:, 3:4],
-            metadata[:, 4:],
-        )
+        total_length = metadata[:, 0:1]
+        offset = metadata[:, 1:2]
+        length = metadata[:, 2:3]
+        artist = metadata[:, 3:4]
+        genre = metadata[:, 4:]
+        
         # Start embedding of length 1
         artist_emb = self.artist_emb(artist)
         # Empty genre slots are denoted by -1. We mask these out.
         mask = (genre >= 0).float().unsqueeze(2)
         genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)
         start_emb = genre_emb + artist_emb
-        # assert_shape(start_emb, (N, 1, self.out_width))
+        # assert_shape(start_emb, (batch_size, 1, self.out_width))
 
         # Pos embedding of length n_ctx
         if self.include_time_signal:
@@ -2118,7 +2075,7 @@ def rescale(music_tokens_shape):
         if config.lyric_conditioning and not config.single_enc_dec[-level - 1]:
             # lyric_enc -> lyric_enc
             lyric_enc_kwargs = dict(
-                embed_dim=config.lyric_enc_n_vocab,  # previously bins
+                embed_dim=config.lyric_enc_n_vocab, # previously bins
                 width=config.lyric_enc_width[-level - 1],
                 depth=config.lyric_enc_depth[-level - 1],
                 heads=config.lyric_enc_heads,
@@ -2250,7 +2207,7 @@ def conditioner_block(_level):
         self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
         self.sample_length = self.n_ctx * self.raw_to_tokens
 
-        print(
+        logger.info(
             f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
             f" length:{self.sample_length}"
         )
@@ -2320,14 +2277,14 @@ def prior_preprocess(self, tokens, conds):
         Shifts the input tokens to account for the dictionnary merge. The prior_embed_dim_shift give by how much. the
         music tokens should be shifted by + nb_vocab.
         """
-        N = tokens[0].shape[0]
+        batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
-            tokens[i] = (tokens[i] + int(self.prior_embed_dim_shift[i])).view(N, -1)
+            tokens[i] = (tokens[i] + int(self.prior_embed_dim_shift[i])).view(batch_size, -1)
 
         for i in range(len(conds)):
             cond, dims = conds[i], self.prior_dims[i]
             if cond is None:
-                conds[i] = torch.zeros((N, dims, self.prior_width), dtype=torch.float, device=tokens[0].device)
+                conds[i] = torch.zeros((batch_size, dims, self.prior_width), dtype=torch.float, device=tokens[0].device)
 
         return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
 
@@ -2338,7 +2295,7 @@ def prior_postprocess(self, tokens):
          - nb_vocab.
         Returns : only returns the music tokens
         """
-        N = tokens.shape[0]
+        batch_size = tokens.shape[0]
         # dim (nb_lyric_tokens, vqvae_codebook dim = latent_dim of the model)
         dims = (self.prior_dims[0], tokens.shape[1] - self.prior_dims[0])
         tokens = list(torch.split(tokens, dims, dim=1))
@@ -2347,7 +2304,7 @@ def prior_postprocess(self, tokens):
         for i in range(len(tokens)):
             shape = self.prior_shapes[i]
             _, bins_shift = int(self.prior_embed_dim[i]), int(self.prior_embed_dim_shift[i])  # bins, -> _,
-            tokens[i] = (tokens[i] - bins_shift).view(N, -1, *shape[1:])
+            tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1, *shape[1:])
             tokens[i] = torch.clamp(
                 tokens[i], min=0
             )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
@@ -2426,7 +2383,7 @@ def sample(
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
-        print(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+        logger.info(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
 
         with torch.no_grad():
             # Currently audio_conditioning only uses immediately above layer
@@ -2640,11 +2597,8 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    # prior.to(tokens.device)
     prior.to("cuda")
-    # gc.collect()
-    # torch.cuda.empty_cache()
-    for start in get_range(get_starts(total_length, n_ctx, hop_length)):
+    for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc = "Sampling {n_ctx} tokens"):
         end = start + n_ctx
         # set metadata offset, sample_length and lyrics tokens
         metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
@@ -2691,46 +2645,15 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
 
 
 def save_wav(fname, lvl, metas, aud, sampling_rate):
-    import soundfile
-
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
+    return
     for i in list(range(aud.shape[0])):
         if metas is not None:
-            # twitter prompts or inputs are in the form of a dictionnary
             artists, genres, lyrics = list(metas)[i].values()
             path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
-            soundfile.write(path, aud[i], samplerate=sampling_rate, format="wav")
+            np.save(path, aud[i])
         else:
-            soundfile.write(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i], samplerate=sampling_rate, format="wav")
-
-
-def load_audio(file, sampling_rate, offset, duration, mono=False):
-    import librosa
-
-    # Librosa loads more filetypes than soundfile
-    raw_audio, _ = librosa.load(
-        file, sr=sampling_rate, mono=mono, offset=offset / sampling_rate, duration=duration / sampling_rate
-    )
-    if len(raw_audio.shape) == 1:
-        raw_audio = raw_audio.reshape((1, -1))
-    return raw_audio
-
-
-def load_prompts(audio_files, hps, sample_length_in_seconds=70, offset_in_seconds=10):
-    duration = sample_length_in_seconds * hps.sampling_rate
-    offset = offset_in_seconds * hps.sampling_rate
-    raw_audio_list = []
-    for audio_file in audio_files:
-        raw_audio = load_audio(
-            audio_file, sampling_rate=hps.sampling_rate, duration=duration, offset=offset, mono=True
-        )
-        raw_audio = raw_audio.T  # CT -> TC
-        raw_audio_list.append(raw_audio)
-    while len(raw_audio_list) < len(audio_files):
-        raw_audio_list.extend(raw_audio_list)
-    raw_audio_list = raw_audio_list[: len(audio_files)]
-    raw_audio = torch.stack([torch.from_numpy(raw_audio) for raw_audio in raw_audio_list])
-    return raw_audio
+            np.save(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i])
 
 
 JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
@@ -2814,7 +2737,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             sample_tokens - previous_sampled_tokens.shape[1],
         )
 
-        print(
+        logger.info(
             f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on"
             f" {conditioning_tokens} tokens"
         )
@@ -2861,9 +2784,9 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
 
     # Sample total_length tokens at level=level with hop_length=hop_length
     def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
-        print(f"Sampling level {level}")
+        logger.info(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
-            for start in get_range(get_starts(total_length, self.priors[level].n_ctx, hop_length)):
+            for start in tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), "Sampling at each level"):
                 music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
 
         else:
@@ -2937,16 +2860,21 @@ def _sample(
 
         Example:
         ```python
+        >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
+        >>> import torch
         >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
-        >>> tokenizer = JukeboxTokenizer.from_pretrained(self.model_id)
-        >>> model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-
-        >>> tokens = tokenizer(**self.metas)["input_ids"]
+        >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
 
-        >>> zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        >>> labels = tokenizer(**metas)["input_ids"]
+        >>> set_seed(0)
+        >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
         >>> zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
-        [1864,1536, 1213, 1870, 1357, 1536, 519, 880, 1323, 789, 1082, 534, 1000, 1445, 1105, 1130, 967, 515, 1434, 1620,
-         534, 1495, 283, 1445, 333, 1307, 539, 1631, 1528, 375, 1434, 673, 627, 710, 778, 1883, 1405, 1276, 1455, 1228]
+        >>> zs[-1]
+        tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
+              353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
+             1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
+             1804,  541, 1804, 1434]])
         ```"""
 
         top_prior = self.priors[-1]
@@ -2989,8 +2917,6 @@ def _sample(
 
             # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
-            # gc.collect()
-            # torch.cuda.empty_cache()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
             # self.priors[level].total_length = total_length // self.priors[level].raw_to_tokens
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
@@ -3001,8 +2927,6 @@ def _sample(
             )
 
             self.priors[level].to("cpu")
-            # gc.collect()
-            # torch.cuda.empty_cache()
             self.vqvae.to(music_tokens[level].device)
             # Decode sample
             with torch.no_grad():
@@ -3017,8 +2941,6 @@ def _sample(
                     os.makedirs(logdir)
                 save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
                 if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
-                    # gc.collect()
-                    # torch.cuda.empty_cache()
                     with torch.no_grad():
                         alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], fp16, self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
@@ -3039,23 +2961,20 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         Example:
 
         ```python
-        >>> from transformers import JukeboxTokenizer, JukeboxModel
-
-        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0)
-        Level:0, Cond downsample:4, Raw to tokens:8, Sample length:65536
-        Level:1, Cond downsample:4, Raw to tokens:32, Sample length:262144
-        Level:2, Cond downsample:None, Raw to tokens:128, Sample length:786432
-
+        >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+        
         >>> lyrics = "Hey, are you awake? Can you talk to me?"
         >>> artist = "Zac Brown Band"
         >>> genre = "Country"
         >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
+        >>> set_seed(0)
+        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)
 
-        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length_in_seconds=2)
-
-        >>> model.decode(music_tokens)[:, :10]
-        tensor([[-0.0006], [ 0.0009], [ 0.0005], [-0.0010], [-0.0010], [-0.0004],[-0.0002], [-0.0004], [-0.0005], [ 0.0002]]
+        >>> model.decode(music_tokens)[:, :10].squeeze(-1)
+        tensor([[-0.0003, -0.0012,  0.0009,  0.0012,  0.0018,  0.0003, -0.0015, -0.0020,
+                 -0.0013,  0.0010]])
         ```"""
 
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))

From 273f1251f125655004e2622d6052760823ed747a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 19 Oct 2022 09:50:09 +0000
Subject: [PATCH 134/196] update tqdm usage

---
 .../models/jukebox/modeling_jukebox.py        | 273 +++++++++---------
 1 file changed, 139 insertions(+), 134 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 831114ba1e28b..a9302f2cc1bf7 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -26,11 +26,11 @@
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import LayerNorm as FusedLayerNorm
-from ...utils.logging import tqdm
 
 from ...activations import ACT2FN
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, logging
+from ...utils.logging import tqdm
 from .configuration_jukebox import JukeboxConfig
 
 
@@ -43,8 +43,6 @@
 ]
 
 
-
-
 def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
     """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
     Args:
@@ -109,6 +107,95 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     return tokens.unsqueeze(dim=0), indices
 
 
+# Break total_length into hops/windows of size n_ctx separated by hop_length
+def get_starts(total_length, n_ctx, hop_length):
+    starts = []
+    for start in range(0, total_length - n_ctx + hop_length, hop_length):
+        if start + n_ctx >= total_length:
+            # Last hop could be smaller, we make it n_ctx to maximise context
+            start = total_length - n_ctx
+        starts.append(start)
+    return starts
+
+
+def get_alignment(music_tokens, labels, prior, fp16, config):
+    level = prior.levels - 1  # Top level used
+    n_ctx = prior.n_ctx
+    tokens = music_tokens[level]
+    batch_size, total_length = tokens.shape[0], tokens.shape[1]
+    if total_length < n_ctx:
+        padding_length = n_ctx - total_length
+        tokens = torch.cat(
+            [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
+        )
+        total_length = tokens.shape[1]
+    else:
+        padding_length = 0
+
+    hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx)
+    alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0]
+    attn_layers = set([alignment_layer])
+    alignment_hops = {}
+    indices_hops = {}
+    prior.to("cuda")
+    for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc="Computing lyric to music alignment "):
+        end = start + n_ctx
+        # set metadata offset, sample_length and lyrics tokens
+        metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
+        metadata.to("cuda")
+        tokens_bs = torch.chunk(tokens, batch_size, dim=0)
+        metadata_bs = torch.chunk(metadata, batch_size, dim=0)
+        w_hops = []
+        for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
+            tokens_i = tokens_i.to("cuda")
+            metadata_i = metadata_i.to("cuda")
+            w_hop = prior.forward_tokens(
+                tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
+            )
+            w_hops.append(w_hop[0][:, alignment_head])
+            del w_hop
+        w = torch.cat(w_hops, dim=0)
+        del w_hops
+        alignment_hop = w.float().cpu().numpy()
+        del w
+
+        # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
+        # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
+        indices_hops[start] = indices_hop
+        alignment_hops[start] = alignment_hop
+    prior.cpu()
+    # gc.collect()
+    # torch.cuda.empty_cache()
+
+    # Combine attn for each hop into attn for full range
+    # Use indices to place them into correct place for corresponding source tokens
+    alignments = []
+    for item in range(batch_size):
+        # Note each item has different length lyrics
+        full_tokens = labels[0, 3:]
+        alignment = np.zeros((total_length, len(full_tokens) + 1))
+        for start in reversed(get_starts(total_length, n_ctx, hop_length)):
+            end = start + n_ctx
+            alignment_hop = alignment_hops[start][item]
+            indices = indices_hops[start][item]
+            alignment[start:end, indices] = alignment_hop
+        alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
+        alignments.append(alignment)
+    return alignments
+
+
+def save_wav(fname, lvl, metas, aud, sampling_rate):
+    aud = torch.clamp(aud, -1, 1).cpu().numpy()
+    return
+    for i in list(range(aud.shape[0])):
+        if metas is not None:
+            artists, genres, lyrics = list(metas)[i].values()
+            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
+            np.save(path, aud[i])
+        else:
+            np.save(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i])
+
+
 def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
     # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
     if mask is None or query_length == 1:
@@ -134,6 +221,7 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
         mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
     return mask.view(1, 1, query_length, key_value_length)
 
+
 class JukeboxConv1D(nn.Module):
     def __init__(self, n_in, n_out, zero_out=False):
         super(JukeboxConv1D, self).__init__()
@@ -438,7 +526,9 @@ def update_codebook(self, hidden_states, latent_states):
     def preprocess(self, hidden_states):
         # NCT -> NTC -> [NT, C]
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
-        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # x_en = (batch_size *L, w), k_j = (w, codebook_dim)
+        hidden_states = hidden_states.view(
+            -1, hidden_states.shape[-1]
+        )  # x_en = (batch_size *L, w), k_j = (w, codebook_dim)
 
         if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
@@ -774,7 +864,9 @@ def forward(self, raw_audio):
         >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
         >>> set_seed(0)
         >>> zs = [torch.random(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        >>> audio = model.decode(zs)
+        >>> model.decode(zs)
+
+
         ```"""
 
         # Encode/Decode
@@ -828,8 +920,6 @@ def forward(self, input):
             return super(JukeboxLayerNorm, self).forward(input).type_as(input)
 
 
-
-
 class JukeboxAttention(nn.Module):
     def __init__(
         self,
@@ -868,11 +958,11 @@ def __init__(
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
         self.qkv, self.attn, self.attn_mask = {
-            0: (self.factored_qkv, self.dense_attn, "autoregressive"), # Attend to all positions
-            1: (self.factored_qkv, self.block_attn, "autoregressive"), # Attend to your block
-            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"), # Attend to transpose block
-            3: (self.factored_qkv, self.prev_block_attn, None), # Attend to previous block
-            4: (self.factored_qkv, self.summary_attn, "summary"), # Attend to last position of each block
+            0: (self.factored_qkv, self.dense_attn, "autoregressive"),  # Attend to all positions
+            1: (self.factored_qkv, self.block_attn, "autoregressive"),  # Attend to your block
+            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
+            3: (self.factored_qkv, self.prev_block_attn, None),  # Attend to previous block
+            4: (self.factored_qkv, self.summary_attn, "summary"),  # Attend to last position of each block
             5: (self.factored_qkv, self.summary_spread_attn, "summary"),
             6: (self.decode_qkv, self.decode_attn, None),
             7: (self.prime_qkv, self.prime_attn, "prime"),
@@ -1361,23 +1451,23 @@ def __init__(
 
         # Orders of attn_func
         attn_func = {
-            0: lambda d: 0, # Complete dense attn
-            1: lambda d: [1, 2][d % 2], # Alternate row and column attn
-            2: lambda d: [1, 2, 3][d % 3], # Alternate row, column and previous row attn
-            3: lambda d: [1, 4][d % 2], # Alternate row and last column
-            4: lambda d: [1, 5][d % 2], # Alternate row and last k columns
-            5: lambda d: [1, 4, 1, 1][d % 4], # Alternate row, last column, row, row
+            0: lambda d: 0,  # Complete dense attn
+            1: lambda d: [1, 2][d % 2],  # Alternate row and column attn
+            2: lambda d: [1, 2, 3][d % 3],  # Alternate row, column and previous row attn
+            3: lambda d: [1, 4][d % 2],  # Alternate row and last column
+            4: lambda d: [1, 5][d % 2],  # Alternate row and last k columns
+            5: lambda d: [1, 4, 1, 1][d % 4],  # Alternate row, last column, row, row
             6: lambda d: [1, 2, 3, 6][d % 4],
             7: lambda d: [*[1, 2, 3] * 5, 6][d % 16],
-            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10], # Used by separated_enc_dec model with lyrics
+            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10],  # Used by separated_enc_dec model with lyrics
             9: lambda d: [1, 2, 3, 0][d % 4],
             10: lambda d: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
                 d % 79
-            ], # Used by large separated_enc_dec model with lyrics
+            ],  # Used by large separated_enc_dec model with lyrics
             11: lambda d: [6, 6, 0][d % 3] if d % 16 == 15 else [1, 2, 3][d % 3],
             12: lambda d: [7, 7, 0][d % 3]
             if d % 16 == 15
-            else [1, 2, 3][d % 3], # Used by single_enc_dec model with lyrics
+            else [1, 2, 3][d % 3],  # Used by single_enc_dec model with lyrics
         }[attn_order]
 
         def attn_block(d):
@@ -1688,9 +1778,10 @@ def sample(
             sampled_tokens, tokens = [], None
             if get_preds:
                 preds = []
-                
-            iter =  tqdm(range(0, sample_tokens), desc = f"Sampling tokens :")
+
+            iter = tqdm(range(0, sample_tokens))
             for sample_t in iter:
+                iter.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
                 )
@@ -1767,7 +1858,7 @@ def primed_sample(
             start = 0
             hidden_states = None
 
-            for current_chunk_size in tqdm(chunk_sizes, desc="Filling past key value"):
+            for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     x_prime, cond_prime = self.get_emb(
@@ -1798,7 +1889,10 @@ def primed_sample(
                 preds.append(x_prime)
 
             hidden_states = sampled_audio[-1]
-            for sample_t in tqdm(range(len(sampled_audio), sample_tokens), desc = f"Sampling {len(sampled_audio)} tokens "):
+
+            iter = tqdm(range(len(sampled_audio), sample_tokens))
+            for sample_t in iter:
+                iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
                 )
@@ -1832,9 +1926,6 @@ def primed_sample(
             return hidden_states
 
 
-
-
-
 class JukeboxMusicTokenConditioner(nn.Module):
     """
     The JukeboxMusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE
@@ -1963,7 +2054,7 @@ def __init__(
         self.out_width = out_width
         nb_genres, nb_artists = metadata_dims
         self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width) #TODO check if that does not break anything
+        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)  # TODO check if that does not break anything
         self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
         # self.bow_genre_emb = nn.Embedding(nb_genres, out_width) #TODO maybe test that
         # self.artist_emb = nn.Embedding(nb_artists, out_width)
@@ -1986,7 +2077,7 @@ def forward(self, metadata):
         length = metadata[:, 2:3]
         artist = metadata[:, 3:4]
         genre = metadata[:, 4:]
-        
+
         # Start embedding of length 1
         artist_emb = self.artist_emb(artist)
         # Empty genre slots are denoted by -1. We mask these out.
@@ -1998,7 +2089,9 @@ def forward(self, metadata):
         # Pos embedding of length n_ctx
         if self.include_time_signal:
             start, end = offset, offset + length
-            total_length, start, end = total_length.float(), start.float(), end.float()
+            total_length = total_length.float()
+            start = start.float()
+            end = end.float()
             pos_emb = (
                 self.total_length_emb(total_length)
                 + self.absolute_pos_emb(start, end)
@@ -2075,7 +2168,7 @@ def rescale(music_tokens_shape):
         if config.lyric_conditioning and not config.single_enc_dec[-level - 1]:
             # lyric_enc -> lyric_enc
             lyric_enc_kwargs = dict(
-                embed_dim=config.lyric_enc_n_vocab, # previously bins
+                embed_dim=config.lyric_enc_n_vocab,
                 width=config.lyric_enc_width[-level - 1],
                 depth=config.lyric_enc_depth[-level - 1],
                 heads=config.lyric_enc_heads,
@@ -2284,7 +2377,9 @@ def prior_preprocess(self, tokens, conds):
         for i in range(len(conds)):
             cond, dims = conds[i], self.prior_dims[i]
             if cond is None:
-                conds[i] = torch.zeros((batch_size, dims, self.prior_width), dtype=torch.float, device=tokens[0].device)
+                conds[i] = torch.zeros(
+                    (batch_size, dims, self.prior_width), dtype=torch.float, device=tokens[0].device
+                )
 
         return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
 
@@ -2379,7 +2474,7 @@ def sample(
         sample_tokens=None,
     ):
         """
-        Ancestral sampling a window of tokens using the provided conditioning and metadatas
+        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
@@ -2452,13 +2547,9 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
                 self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
             lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
             lyric_acts = self.lyric_encoder.proj_in(lyric_acts)
-            if fp16:
-                lyric_acts = lyric_acts.half()
             lyric_encoder_states = self.lyric_encoder.final_layer_norm(lyric_acts)
             if sample:
                 self.lyric_encoder.cpu()
-                if fp16:
-                    lyric_encoder_states = lyric_encoder_states.half()
         else:
             lyric_encoder_states = None
         return lyric_encoder_states
@@ -2567,95 +2658,6 @@ def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
 
-# Break total_length into hops/windows of size n_ctx separated by hop_length
-def get_starts(total_length, n_ctx, hop_length):
-    starts = []
-    for start in range(0, total_length - n_ctx + hop_length, hop_length):
-        if start + n_ctx >= total_length:
-            # Last hop could be smaller, we make it n_ctx to maximise context
-            start = total_length - n_ctx
-        starts.append(start)
-    return starts
-
-
-def get_alignment(music_tokens, labels, prior, fp16, config):
-    level = prior.levels - 1  # Top level used
-    n_ctx = prior.n_ctx
-    tokens = music_tokens[level]
-    batch_size, total_length = tokens.shape[0], tokens.shape[1]
-    if total_length < n_ctx:
-        padding_length = n_ctx - total_length
-        tokens = torch.cat(
-            [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
-        )
-        total_length = tokens.shape[1]
-    else:
-        padding_length = 0
-
-    hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx)
-    alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0]
-    attn_layers = set([alignment_layer])
-    alignment_hops = {}
-    indices_hops = {}
-    prior.to("cuda")
-    for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc = "Sampling {n_ctx} tokens"):
-        end = start + n_ctx
-        # set metadata offset, sample_length and lyrics tokens
-        metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
-        metadata.to("cuda")
-        tokens_bs = torch.chunk(tokens, batch_size, dim=0)
-        metadata_bs = torch.chunk(metadata, batch_size, dim=0)
-        w_hops = []
-        for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
-            tokens_i = tokens_i.to("cuda")
-            metadata_i = metadata_i.to("cuda")
-            w_hop = prior.forward_tokens(
-                tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
-            )
-            w_hops.append(w_hop[0][:, alignment_head])
-            del w_hop
-        w = torch.cat(w_hops, dim=0)
-        del w_hops
-        alignment_hop = w.float().cpu().numpy()
-        del w
-
-        # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
-        # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
-        indices_hops[start] = indices_hop
-        alignment_hops[start] = alignment_hop
-    prior.cpu()
-    # gc.collect()
-    # torch.cuda.empty_cache()
-
-    # Combine attn for each hop into attn for full range
-    # Use indices to place them into correct place for corresponding source tokens
-    alignments = []
-    for item in range(batch_size):
-        # Note each item has different length lyrics
-        full_tokens = labels[0, 3:]
-        alignment = np.zeros((total_length, len(full_tokens) + 1))
-        for start in reversed(get_starts(total_length, n_ctx, hop_length)):
-            end = start + n_ctx
-            alignment_hop = alignment_hops[start][item]
-            indices = indices_hops[start][item]
-            alignment[start:end, indices] = alignment_hop
-        alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
-        alignments.append(alignment)
-    return alignments
-
-
-def save_wav(fname, lvl, metas, aud, sampling_rate):
-    aud = torch.clamp(aud, -1, 1).cpu().numpy()
-    return
-    for i in list(range(aud.shape[0])):
-        if metas is not None:
-            artists, genres, lyrics = list(metas)[i].values()
-            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
-            np.save(path, aud[i])
-        else:
-            np.save(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i])
-
-
 JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
             labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_lenght)` :
                 List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
@@ -2762,9 +2764,9 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
         metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
         tokens = []
-        for music_tokens_i, music_tokens_conds_i, metadata_i in zip(
-            music_tokens_list, music_tokens_conds_list, metadata_list
-        ):
+        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list))
+        for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
+            iterator.set_description(f"Sampling windows of {sample_tokens}")
             tokens_i = prior.sample(
                 n_samples=music_tokens_i.shape[0],
                 music_tokens=music_tokens_i,
@@ -2784,9 +2786,12 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
 
     # Sample total_length tokens at level=level with hop_length=hop_length
     def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
-        logger.info(f"Sampling level {level}")
         if total_length >= self.priors[level].n_ctx:
-            for start in tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), "Sampling at each level"):
+            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length))
+            for start in get_starts(total_length, self.priors[level].n_ctx, hop_length):
+                iterator.set_description(
+                    f"[prior level {level}] Sampling {self.priors[level].n_ctx}/{total_length} tokens", refresh=True
+                )
                 music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
 
         else:
@@ -2964,7 +2969,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
         >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
-        
+
         >>> lyrics = "Hey, are you awake? Can you talk to me?"
         >>> artist = "Zac Brown Band"
         >>> genre = "Country"
@@ -2972,7 +2977,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         >>> set_seed(0)
         >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)
 
-        >>> model.decode(music_tokens)[:, :10].squeeze(-1)
+        >>> with torch.no_grad():model.decode(music_tokens)[:, :10].squeeze(-1)
         tensor([[-0.0003, -0.0012,  0.0009,  0.0012,  0.0018,  0.0003, -0.0015, -0.0020,
                  -0.0013,  0.0010]])
         ```"""

From 3ac4ed931709bdfdb55bd809c23e28143a46123c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 19 Oct 2022 11:00:27 +0000
Subject: [PATCH 135/196] fixup

---
 README_es.md                                        |  1 +
 src/transformers/models/jukebox/modeling_jukebox.py | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README_es.md b/README_es.md
index 8e6ad7d902a37..1df65e9f0cb05 100644
--- a/README_es.md
+++ b/README_es.md
@@ -317,6 +317,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index a9302f2cc1bf7..85486ec2046ad 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -14,10 +14,8 @@
 # limitations under the License.
 """PyTorch Jukebox model."""
 
-import gc
 import math
 import os
-import sys
 import time
 from typing import List
 
@@ -861,12 +859,11 @@ def forward(self, raw_audio):
         Example:
         ```python
         >>> from transformers import JukeboxVQVAE, set_seed
+
         >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
         >>> set_seed(0)
         >>> zs = [torch.random(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         >>> model.decode(zs)
-
-
         ```"""
 
         # Encode/Decode
@@ -2867,6 +2864,7 @@ def _sample(
         ```python
         >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
         >>> import torch
+
         >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
         >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
@@ -2967,6 +2965,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
 
         ```python
         >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
+
         >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
         >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
 
@@ -2977,7 +2976,8 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         >>> set_seed(0)
         >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)
 
-        >>> with torch.no_grad():model.decode(music_tokens)[:, :10].squeeze(-1)
+        >>> with torch.no_grad():
+        ...     model.decode(music_tokens)[:, :10].squeeze(-1)
         tensor([[-0.0003, -0.0012,  0.0009,  0.0012,  0.0018,  0.0003, -0.0015, -0.0020,
                  -0.0013,  0.0010]])
         ```"""

From 13119f309eb7831b00b80367653fb2548dfcac17 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 21 Oct 2022 12:10:46 +0200
Subject: [PATCH 136/196] Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/transformers/models/jukebox/modeling_jukebox.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 85486ec2046ad..4070100f22f1e 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -694,7 +694,8 @@ def __init__(self, config):
             top_raw_to_tokens = np.prod(downsamples)
             config.sample_length = (
                 (config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens) * top_raw_to_tokens
-            ).astype(int)
+            )
+            config.sample_length = config.sample_length.astype(int)
 
         input_shape = (config.sample_length, 1)
         block_kwargs = dict(
@@ -977,7 +978,6 @@ def __init__(
         self.encoder_dims = encoder_dims
         self.lyric_enc_len = lyric_enc_len
         self.record_attn = False
-        self.w = None
 
     def _attn(self, query_states, key_states, value_states, sample):
         scale = 1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.num_heads))

From 6bada42411a62c60ee14f179c858867677bd3c45 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 10:33:03 +0000
Subject: [PATCH 137/196] Update code based on review

---
 .../models/jukebox/modeling_jukebox.py        | 352 ++++++++----------
 1 file changed, 152 insertions(+), 200 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 85486ec2046ad..648b17aca7085 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -50,7 +50,7 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
     """
     logits = logits.clone()
     top_k = min(top_k, logits.size(-1))  # Safety check
-    assert (top_k == 0) or (top_p == 0.0)
+
     if top_k > 0:
         # Remove all tokens with a probability less than the last token of the top-k
         indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
@@ -83,7 +83,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
 
     Args:
         full_tokens (`List[int]`):
-            List containing the ids of the entire lyrics.
+            List containing the token ids of the entire lyrics.
         total_length (`int`):
             Total expected length of the music (not all of it is generated, see duration), in samples.
         offset (`int`):
@@ -116,7 +116,7 @@ def get_starts(total_length, n_ctx, hop_length):
     return starts
 
 
-def get_alignment(music_tokens, labels, prior, fp16, config):
+def get_alignment(music_tokens, labels, prior, config):
     level = prior.levels - 1  # Top level used
     n_ctx = prior.n_ctx
     tokens = music_tokens[level]
@@ -148,7 +148,7 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
             tokens_i = tokens_i.to("cuda")
             metadata_i = metadata_i.to("cuda")
             w_hop = prior.forward_tokens(
-                tokens_i[:, start:end], [], metadata_i, fp16=fp16, get_attn_weights=attn_layers
+                tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers
             )
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
@@ -162,8 +162,6 @@ def get_alignment(music_tokens, labels, prior, fp16, config):
         indices_hops[start] = indices_hop
         alignment_hops[start] = alignment_hop
     prior.cpu()
-    # gc.collect()
-    # torch.cuda.empty_cache()
 
     # Combine attn for each hop into attn for full range
     # Use indices to place them into correct place for corresponding source tokens
@@ -687,6 +685,7 @@ def forward(self, input_audio):
     JUKEBOX_START_DOCSTRING,
 )
 class JukeboxVQVAE(PreTrainedModel):
+    config_class = JukeboxConfig
     def __init__(self, config):
         super().__init__(config)
         if not config.sample_length:
@@ -859,11 +858,12 @@ def forward(self, raw_audio):
         Example:
         ```python
         >>> from transformers import JukeboxVQVAE, set_seed
-
-        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
+        >>> import torch
+        >>> model = JukeboxVQVAE.from_pretrained("ArthurZ/vqvae-dummy").eval()
         >>> set_seed(0)
-        >>> zs = [torch.random(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        >>> model.decode(zs)
+        >>> zs = [torch.randint(100,(4,1))]
+        >>> model.decode(zs).shape
+        torch.Size([4, 8, 1])
         ```"""
 
         # Encode/Decode
@@ -954,15 +954,22 @@ def __init__(
 
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
-        self.qkv, self.attn, self.attn_mask = {
-            0: (self.factored_qkv, self.dense_attn, "autoregressive"),  # Attend to all positions
-            1: (self.factored_qkv, self.block_attn, "autoregressive"),  # Attend to your block
-            2: (self.factored_qkv, self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
-            3: (self.factored_qkv, self.prev_block_attn, None),  # Attend to previous block
-            4: (self.factored_qkv, self.summary_attn, "summary"),  # Attend to last position of each block
-            5: (self.factored_qkv, self.summary_spread_attn, "summary"),
-            6: (self.decode_qkv, self.decode_attn, None),
-            7: (self.prime_qkv, self.prime_attn, "prime"),
+        if attn_func == 6 : 
+            self.qkv = self.decode_qkv 
+        elif attn_func == 7:
+            self.qkv = self.prime_qkv
+        else:
+            self.qkv = self.factored_qkv
+            
+        self.attn, self.attn_mask = {
+            0: (self.dense_attn, "autoregressive"),  # Attend to all positions
+            1: (self.block_attn, "autoregressive"),  # Attend to your block
+            2: (self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
+            3: (self.prev_block_attn, None),  # Attend to previous block
+            4: (self.summary_attn, "summary"),  # Attend to last position of each block
+            5: (self.summary_spread_attn, "summary"),
+            6: (self.dense_attn, None),
+            7: (self.prime_attn, "prime"),
         }[
             attn_func
         ]  # Attend to last key position of each block
@@ -1039,13 +1046,9 @@ def dense_attn(self, query, key, value, sample):
         return context_states
 
     def block_attn(self, query, key, value, sample):
-        _, block_ctx = (
-            self.blocks,
-            self.block_ctx,
-        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        block_ctx = self.block_ctx
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
-            assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             query_length = query.shape[1]
@@ -1059,10 +1062,7 @@ def block_attn(self, query, key, value, sample):
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def transpose_block_attn(self, query, key, value, sample):
-        _, block_ctx = (
-            self.blocks,
-            self.block_ctx,
-        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        block_ctx = self.block_ctx
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             block_len = (seq_len - 1) % block_ctx
@@ -1071,44 +1071,32 @@ def transpose_block_attn(self, query, key, value, sample):
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
             query_length = query.shape[1]
-            query = (
-                query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim)
-                .transpose(1, 2)
-                .contiguous()
-                .view(batch_size * block_ctx, query_length // block_ctx, embed_dim)
-            )
-            key = (
-                key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
-                .transpose(1, 2)
-                .contiguous()
-                .view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
-            )
-            value = (
-                value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
-                .transpose(1, 2)
-                .contiguous()
-                .view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
-            )
-            return (
-                self.dense_attn(query, key, value, sample)
-                .view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
-                .transpose(1, 2)
-                .contiguous()
-                .view(batch_size, query_length, embed_dim)
-            )
+            query = query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim)
+            query = query.transpose(1, 2).contiguous()
+            query = query.view(batch_size * block_ctx, query_length // block_ctx, embed_dim)
+
+            key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+            key = key.transpose(1, 2).contiguous()
+            key = key.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+            value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+            value = value.transpose(1, 2).contiguous()
+            value = value.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+            block_attn = self.dense_attn(query, key, value, sample)
+            block_attn = block_attn.view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
+            block_attn = block_attn.transpose(1, 2).contiguous()
+            block_attn = block_attn.view(batch_size, query_length, embed_dim)
+            
+            return block_attn
 
     def prev_block_attn(self, query, key, value, sample):
-        _, block_ctx = (
-            self.blocks,
-            self.block_ctx,
-        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        block_ctx = self.block_ctx
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
-            assert seq_len == self._suff_cache_len(), f"{seq_len} != {self._suff_cache_len()}"
             block = (seq_len - 1) // block_ctx
             prev_l = (block - 1) * block_ctx
             if block > 0:
-                assert prev_l == 0
                 key = key[:, prev_l : prev_l + block_ctx, :]
                 value = value[:, prev_l : prev_l + block_ctx, :]
             else:
@@ -1118,77 +1106,62 @@ def prev_block_attn(self, query, key, value, sample):
         else:
             query_length = query.shape[1]
             query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
-            key = torch.nn.functional.pad(
-                key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
-            value = torch.nn.functional.pad(
-                value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :], (0, 0, 0, 0, 1, 0)
-            ).view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            
+            key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+            key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0))
+            key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            
+            value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+            value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0))
+            value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+
             if query_length < seq_len:
                 qb = query_length // block_ctx
                 kb = seq_len // block_ctx
                 seq_len = query_length
-                key = (
-                    key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
-                    .contiguous()
-                    .view(batch_size * qb, block_ctx, embed_dim)
-                )
-                value = (
-                    value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
-                    .contiguous()
-                    .view(batch_size * qb, block_ctx, embed_dim)
-                )
+                key = key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
+                key = key.contiguous().view(batch_size * qb, block_ctx, embed_dim)
+
+                value = value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
+                value = value.contiguous().view(batch_size * qb, block_ctx, embed_dim)
+
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def summary_attn(self, query, key, value, sample):
-        blocks, block_ctx = (
-            self.blocks,
-            self.block_ctx,
-        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        blocks = self.blocks
+        block_ctx = self.block_ctx
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
-            key = torch.nn.functional.pad(key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0))
-            value = torch.nn.functional.pad(
-                value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :], (0, 0, 1, 0)
-            )
+            key = key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+            key = torch.nn.functional.pad(key, (0, 0, 1, 0))
+
+            value = value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+            value = torch.nn.functional.pad(value, (0, 0, 1, 0))
             return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
         else:
-            key = torch.nn.functional.pad(
-                key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # batch_size, blocks, embed_dim
-            value = torch.nn.functional.pad(
-                value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :], (0, 0, 1, 0)
-            )  # batch_size, blocks, embed_dim
+            key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+            key = torch.nn.functional.pad(key, (0, 0, 1, 0))  # batch_size, blocks, embed_dim
+
+            value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+            value = torch.nn.functional.pad(value, (0, 0, 1, 0))  # batch_size, blocks, embed_dim
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def summary_spread_attn(self, query, key, value, sample):
-        blocks, _, spread = (
-            self.blocks,
-            self.block_ctx,
-            self.spread,
-        )  # block_ctx is seq_len // blocks for complete seq_len ie seq_len = n_ctx. Sampling has less l
+        blocks = self.blocks
+        spread = self.spread
+        
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
-            assert False, "Not yet implemented"
-            # key = torch.nn.functional.pad(k,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, embed_dim)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, embed_dim)
-            # value = torch.nn.functional.pad(value,(0,0,block_ctx,(-l)%block_ctx)).view(batch_size, -1, block_ctx, embed_dim)[:,:-1,-spread:,:].contiguous().view(batch_size, -1, embed_dim)
-            # return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+            raise NotImplementedError
         else:
-            key = (
-                torch.nn.functional.pad(
-                    key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :], (0, 0, 0, 0, 1, 0)
-                )
-                .contiguous()
-                .view(batch_size, blocks * spread, embed_dim)
-            )  # batch_size, blocks * spread, embed_dim
-            value = (
-                torch.nn.functional.pad(
-                    value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :],
-                    (0, 0, 0, 0, 1, 0),
-                )
-                .contiguous()
-                .view(batch_size, blocks * spread, embed_dim)
-            )  # batch_size, blocks * spread, embed_dim
+            key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+            key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)).contiguous()
+            key = key.view(batch_size, blocks * spread, embed_dim)
+            
+            value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+            value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)).contiguous()
+            value = value.view(batch_size, blocks * spread, embed_dim)
+            
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def prime_attn(self, query, key, value, sample):
@@ -1197,15 +1170,11 @@ def prime_attn(self, query, key, value, sample):
         value = value[:, :lyric_enc_len]
         return self.dense_attn(query, key, value, sample)
 
-    def decode_attn(self, query, key, value, sample):
-        assert (
-            key.shape[1] == value.shape[1] == self.encoder_dims
-        ), f"k: {key.shape}, v: {value.shape}, enc_dims: {self.encoder_dims}"
-        return self.dense_attn(query, key, value, sample)
-
     def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        assert lyric_encoder_states is None
+        if lyric_encoder_states is not None: 
+            raise TypeError("lyric_encoder_states should be None")
+
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             self.sample_t += curr_ctx
@@ -1226,7 +1195,8 @@ def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
 
     def prime_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        assert lyric_encoder_states is None
+        if lyric_encoder_states is not None: 
+            raise TypeError("lyric_encoder_states should be None")
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
             if self._cache_len() < self._lyric_enc_len:
@@ -1447,25 +1417,7 @@ def __init__(
         res_scale = 1.0 / n_depth if res_scale else 1.0
 
         # Orders of attn_func
-        attn_func = {
-            0: lambda d: 0,  # Complete dense attn
-            1: lambda d: [1, 2][d % 2],  # Alternate row and column attn
-            2: lambda d: [1, 2, 3][d % 3],  # Alternate row, column and previous row attn
-            3: lambda d: [1, 4][d % 2],  # Alternate row and last column
-            4: lambda d: [1, 5][d % 2],  # Alternate row and last k columns
-            5: lambda d: [1, 4, 1, 1][d % 4],  # Alternate row, last column, row, row
-            6: lambda d: [1, 2, 3, 6][d % 4],
-            7: lambda d: [*[1, 2, 3] * 5, 6][d % 16],
-            8: lambda d: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][d % 10],  # Used by separated_enc_dec model with lyrics
-            9: lambda d: [1, 2, 3, 0][d % 4],
-            10: lambda d: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
-                d % 79
-            ],  # Used by large separated_enc_dec model with lyrics
-            11: lambda d: [6, 6, 0][d % 3] if d % 16 == 15 else [1, 2, 3][d % 3],
-            12: lambda d: [7, 7, 0][d % 3]
-            if d % 16 == 15
-            else [1, 2, 3][d % 3],  # Used by single_enc_dec model with lyrics
-        }[attn_order]
+        attn_func = self.get_attn_func(attn_order)
 
         def attn_block(d):
             return JukeboxBlock(
@@ -1495,6 +1447,43 @@ def attn_block(d):
 
         self.saved_attn_weights = []
 
+
+    def get_attn_func(self, attn_order : int):
+        """
+        Get the correct attention order pattern.  
+        """
+        
+        if attn_order == 0 : 
+            attn_func = lambda layer : 0
+        if attn_order == 1 : 
+            attn_func = lambda layer: [1, 2][layer % 2]
+        elif attn_order == 2 : 
+            attn_func = lambda layer: [1, 2, 3][layer % 3]  # Alternate row, column and previous row attn
+        elif attn_order == 3 : 
+            attn_func = lambda layer: [1, 4][layer % 2]  # Alternate row and last column
+        elif attn_order == 4 : 
+            attn_func = lambda layer: [1, 5][layer % 2]  # Alternate row and last k columns
+        elif attn_order == 5 : 
+            attn_func = lambda layer: [1, 4, 1, 1][layer % 4]  # Alternate row, last column, row, row
+        elif attn_order == 6 : 
+            attn_func = lambda layer: [1, 2, 3, 6][layer % 4]
+        elif attn_order == 7 : 
+            attn_func = lambda layer: [*[1, 2, 3] * 5, 6][layer % 16]
+        elif attn_order == 8 : 
+            attn_func = lambda layer: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][layer % 10]  # Used by separated_enc_dec model with lyrics
+        elif attn_order == 9 : 
+            attn_func = lambda layer: [1, 2, 3, 0][layer % 4]
+        elif attn_order == 10 : 
+            attn_func = lambda layer: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
+                layer % 79
+            ]  # Used by large separated_enc_dec model with lyrics
+        elif attn_order == 11 : 
+            attn_func = lambda layer: [6, 6, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3]
+        elif attn_order == 12 : 
+            attn_func = lambda layer: [7, 7, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3]  # Used by single_enc_dec model with lyrics
+        
+        return attn_func
+
     def set_record_attn(self, record_attn):
         """
         Arguments:
@@ -1510,19 +1499,13 @@ def _should_record_attn(layer_idx):
 
         for i, layer in enumerate(self._attn_mods):
             layer.attn.record_attn = _should_record_attn(i)
-        if record_attn:
-            assert self.saved_attn_weights == []
-            for layer in self._attn_mods:
-                assert layer.attn.w is None
-        else:
+
+        if not record_attn:
             self.saved_attn_weights = []
             for layer in self._attn_mods:
                 layer.attn.w = None
 
-    def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=False, fp16_out=False):
-        if fp16:
-            hidden_states = hidden_states.half()
-
+    def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
         # Blocks
         for i, attn_layer in enumerate(self._attn_mods):
             if attn_layer.attn_func == 6:  # attend to the lyrics
@@ -1531,8 +1514,6 @@ def forward(self, hidden_states, lyric_encoder_states=None, sample=False, fp16=F
                 hidden_states = attn_layer(hidden_states, lyric_encoder_states=None, sample=sample)
             if attn_layer.attn.record_attn:
                 self.saved_attn_weights.append(attn_layer.attn.c_attn.weight)
-        if not fp16_out:
-            hidden_states = hidden_states.float()
         return hidden_states
 
     def del_cache(self):
@@ -1669,7 +1650,6 @@ def forward(
         audio_conditioning=None,
         metadata_conditioning=None,
         lyric_encoder_states=None,
-        fp16=False,
         get_preds=False,
         get_acts=False,
         get_sep_loss=False,
@@ -1699,9 +1679,7 @@ def forward(
             self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning
         )  # Pos emb and dropout
 
-        hidden_states = self.transformer(
-            hidden_states, lyric_encoder_states=lyric_encoder_states, fp16=fp16, fp16_out=fp16
-        )  # Transformer
+        hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states)  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
             hidden_states = hidden_states + audio_conditioning
 
@@ -1756,7 +1734,6 @@ def sample(
         audio_conditioning=None,
         metadata_conditioning=None,
         lyric_encoder_states=None,
-        fp16=False,
         temp=1.0,
         top_k=0,
         top_p=0.0,
@@ -1783,9 +1760,7 @@ def sample(
                     sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
                 )
 
-                hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
-                )
+                hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True)
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
@@ -1822,7 +1797,6 @@ def primed_sample(
         audio_conditioning=None,
         metadata_conditioning=None,
         lyric_encoder_states=None,
-        fp16=False,
         temp=1.0,
         top_k=0,
         top_p=0.0,
@@ -1870,7 +1844,7 @@ def primed_sample(
                 del conds_prime
                 if not get_preds:
                     del cond_prime
-                x_prime = self.transformer(x_prime, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16)
+                x_prime = self.transformer(x_prime, lyric_encoder_states=lyric_encoder_states, sample=True)
 
                 if get_preds:
                     if self.add_cond_after_transformer:
@@ -1895,7 +1869,7 @@ def primed_sample(
                 )
 
                 hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True, fp16=fp16, fp16_out=fp16
+                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -2000,23 +1974,20 @@ def __init__(self, n_time, embed_dim, range, out_width, init_scale, clamp=False)
 
     def forward(self, pos_start, pos_end=None):
         # Check if [pos_start,pos_end] in [pos_min, pos_max)
-        assert len(pos_start.shape) == 2, f"Expected shape with 2 dims, got {pos_start.shape}"
-        assert (self.pos_min <= pos_start).all() and (
-            pos_start < self.pos_max
-        ).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}"
+        if not len(pos_start.shape) == 2:
+            raise TypeError(f"Expected shape with 2 dims, got {pos_start.shape}")
+        if not (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all() :
+            raise TypeError(f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}")
+        
         pos_start = pos_start.float()
         if pos_end is not None:
-            assert len(pos_end.shape) == 2, f"Expected shape with 2 dims, got {pos_end.shape}"
             if self.clamp:
                 pos_end = pos_end.clamp(self.pos_min, self.pos_max)
-            assert (self.pos_min <= pos_end).all() and (
-                pos_end <= self.pos_max
-            ).all(), f"Range is [{self.pos_min},{self.pos_max}), got {pos_end}"
+
             pos_end = pos_end.float()
         # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx
         n_time = self.n_time
         if n_time != 1:
-            assert pos_end is not None
             interpolation = (
                 torch.arange(0, n_time, dtype=torch.float, device=pos_start.device).view(1, n_time) / n_time
             )
@@ -2081,7 +2052,6 @@ def forward(self, metadata):
         mask = (genre >= 0).float().unsqueeze(2)
         genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)
         start_emb = genre_emb + artist_emb
-        # assert_shape(start_emb, (batch_size, 1, self.out_width))
 
         # Pos embedding of length n_ctx
         if self.include_time_signal:
@@ -2137,11 +2107,8 @@ def rescale(music_tokens_shape):
 
         self.music_tokens_shapes = music_tokens_shapes
         self.levels = len(self.music_tokens_shapes)
-
-        self.music_tokens_shape = self.music_tokens_shapes[level]
-
         self.level = level
-
+        self.music_tokens_shape = self.music_tokens_shapes[level]
         self.latent_dim = config.prior_latent_dim
 
         prior_kwargs = dict(
@@ -2463,7 +2430,6 @@ def sample(
         music_tokens=None,
         music_tokens_conds=None,
         metadata=None,
-        fp16=False,
         temp=1.0,
         top_k=0,
         top_p=0.0,
@@ -2496,7 +2462,6 @@ def sample(
                     music_tokens,
                     audio_conditioning,
                     metadata_conditioning,
-                    fp16=fp16,
                     temp=temp,
                     top_k=top_k,
                     top_p=top_p,
@@ -2505,14 +2470,13 @@ def sample(
                 )
                 music_tokens = self.prior_postprocess(tokens)
             else:
-                lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16, sample=True)
+                lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, sample=True)
                 if no_past_context:
                     music_tokens = self.prior.sample(
                         n_samples,
                         audio_conditioning,
                         metadata_conditioning,
                         lyric_encoder_states,
-                        fp16=fp16,
                         temp=temp,
                         top_k=top_k,
                         top_p=top_p,
@@ -2525,7 +2489,6 @@ def sample(
                         audio_conditioning,
                         metadata_conditioning,
                         lyric_encoder_states,
-                        fp16=fp16,
                         temp=temp,
                         top_k=top_k,
                         top_p=top_p,
@@ -2534,7 +2497,7 @@ def sample(
                     )
         return music_tokens
 
-    def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
+    def get_lyric_encoder_states(self, lyric_tokens, sample=False):
         """
         Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
         the lyric encoder.
@@ -2542,7 +2505,7 @@ def get_lyric_encoder_states(self, lyric_tokens, fp16=False, sample=False):
         if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
             if sample:
                 self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
-            lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None, fp16=fp16)
+            lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None)
             lyric_acts = self.lyric_encoder.proj_in(lyric_acts)
             lyric_encoder_states = self.lyric_encoder.final_layer_norm(lyric_acts)
             if sample:
@@ -2566,7 +2529,7 @@ def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
         return lyric_enc_loss
 
     def forward_tokens(
-        self, music_tokens, music_tokens_conds=[], metadata=None, fp16=False, get_preds=False, get_attn_weights=False
+        self, music_tokens, music_tokens_conds=[], metadata=None, get_preds=False, get_attn_weights=False
     ):
         """
         Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
@@ -2586,17 +2549,16 @@ def forward_tokens(
                 [lyric_tokens, music_tokens], [None, audio_conditioning]
             )
             (lyric_enc_loss, gen_loss), preds = self.prior(
-                tokens, audio_conditioning, metadata_conditioning, fp16=fp16, get_sep_loss=True, get_preds=get_preds
+                tokens, audio_conditioning, metadata_conditioning, get_sep_loss=True, get_preds=get_preds
             )
         else:
-            lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, fp16=fp16)
+            lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens)
             lyric_enc_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
             gen_loss, preds = self.prior(
                 music_tokens,
                 audio_conditioning,
                 metadata_conditioning,
                 lyric_encoder_states,
-                fp16=fp16,
                 get_preds=get_preds,
             )
         loss = (self.lyric_enc_loss_fraction * lyric_enc_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
@@ -2616,14 +2578,13 @@ def forward_tokens(
         else:
             return loss, metrics
 
-    def forward(self, hidden_states, metadata=None, fp16=False, decode=False, get_preds=False):
+    def forward(self, hidden_states, metadata=None, decode=False, get_preds=False):
         batch_size = hidden_states.shape[0]
         music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
         loss, metrics = self.forward_tokens(
             music_tokens=music_tokens,
             music_tokens_conds=music_tokens_conds,
             metadata=metadata,
-            fp16=fp16,
             get_preds=get_preds,
         )
         if decode:
@@ -2752,8 +2713,6 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
-        # gc.collect()
-        # torch.cuda.empty_cache()
         max_batch_size = sampling_kwargs["max_batch_size"]
         del sampling_kwargs["max_batch_size"]
 
@@ -2814,7 +2773,6 @@ def _sample(
         offset=0,
         save_results=True,
         sample_length=None,
-        fp16=False,
     ) -> List[torch.LongTensor]:
         """
         Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
@@ -2856,9 +2814,6 @@ def _sample(
                 time.
             sample_length (`int`, *optional*, defaults to None):
                 Desired lenght of the generation in samples.
-            fp16 (`bool`, *optional*, defaults to False):
-                Whether or not to cast the hidden states to float16 in the attention layer. Defaults to False.
-
 
         Example:
         ```python
@@ -2891,21 +2846,18 @@ def _sample(
         sampling_kwargs = [
             dict(
                 temp=0.99,
-                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             ),
             dict(
                 temp=0.99,
-                fp16=fp16,
                 max_batch_size=lower_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             ),
             dict(
                 temp=sampling_temperature,
-                fp16=fp16,
                 max_batch_size=max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -2945,9 +2897,9 @@ def _sample(
                 save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
                 if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():
-                        alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], fp16, self.config)
+                        alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
-                    # disabled saving to html, as it requires too many dependencies.
+
         return music_tokens
 
     @add_start_docstrings(

From 4307ee633ca544326c74bca7215ee305c78adf9a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 13:14:56 +0000
Subject: [PATCH 138/196] add fp16 support and test

---
 .../models/jukebox/modeling_jukebox.py        | 129 +++++++++---------
 tests/models/jukebox/test_modeling_jukebox.py |  25 ++++
 2 files changed, 87 insertions(+), 67 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 64c36f9f857c2..92b627d051abc 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -147,9 +147,7 @@ def get_alignment(music_tokens, labels, prior, config):
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
             tokens_i = tokens_i.to("cuda")
             metadata_i = metadata_i.to("cuda")
-            w_hop = prior.forward_tokens(
-                tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers
-            )
+            w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers)
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
         w = torch.cat(w_hops, dim=0)
@@ -686,14 +684,15 @@ def forward(self, input_audio):
 )
 class JukeboxVQVAE(PreTrainedModel):
     config_class = JukeboxConfig
+
     def __init__(self, config):
         super().__init__(config)
         if not config.sample_length:
             downsamples = [stride**down for stride, down in zip(config.vqvae_strides_t, config.vqvae_down_t)]
             top_raw_to_tokens = np.prod(downsamples)
             config.sample_length = (
-                (config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens) * top_raw_to_tokens
-            )
+                config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens
+            ) * top_raw_to_tokens
             config.sample_length = config.sample_length.astype(int)
 
         input_shape = (config.sample_length, 1)
@@ -860,9 +859,10 @@ def forward(self, raw_audio):
         ```python
         >>> from transformers import JukeboxVQVAE, set_seed
         >>> import torch
+
         >>> model = JukeboxVQVAE.from_pretrained("ArthurZ/vqvae-dummy").eval()
         >>> set_seed(0)
-        >>> zs = [torch.randint(100,(4,1))]
+        >>> zs = [torch.randint(100, (4, 1))]
         >>> model.decode(zs).shape
         torch.Size([4, 8, 1])
         ```"""
@@ -955,13 +955,13 @@ def __init__(
 
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
-        if attn_func == 6 : 
-            self.qkv = self.decode_qkv 
+        if attn_func == 6:
+            self.qkv = self.decode_qkv
         elif attn_func == 7:
             self.qkv = self.prime_qkv
         else:
             self.qkv = self.factored_qkv
-            
+
         self.attn, self.attn_mask = {
             0: (self.dense_attn, "autoregressive"),  # Attend to all positions
             1: (self.block_attn, "autoregressive"),  # Attend to your block
@@ -1087,7 +1087,7 @@ def transpose_block_attn(self, query, key, value, sample):
             block_attn = block_attn.view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
             block_attn = block_attn.transpose(1, 2).contiguous()
             block_attn = block_attn.view(batch_size, query_length, embed_dim)
-            
+
             return block_attn
 
     def prev_block_attn(self, query, key, value, sample):
@@ -1106,11 +1106,11 @@ def prev_block_attn(self, query, key, value, sample):
         else:
             query_length = query.shape[1]
             query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
-            
+
             key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
             key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0))
             key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
-            
+
             value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
             value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0))
             value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
@@ -1149,7 +1149,7 @@ def summary_attn(self, query, key, value, sample):
     def summary_spread_attn(self, query, key, value, sample):
         blocks = self.blocks
         spread = self.spread
-        
+
         batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
         if sample:
             raise NotImplementedError
@@ -1157,11 +1157,11 @@ def summary_spread_attn(self, query, key, value, sample):
             key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
             key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)).contiguous()
             key = key.view(batch_size, blocks * spread, embed_dim)
-            
+
             value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
             value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)).contiguous()
             value = value.view(batch_size, blocks * spread, embed_dim)
-            
+
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def prime_attn(self, query, key, value, sample):
@@ -1172,7 +1172,7 @@ def prime_attn(self, query, key, value, sample):
 
     def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        if lyric_encoder_states is not None: 
+        if lyric_encoder_states is not None:
             raise TypeError("lyric_encoder_states should be None")
 
         query, key, value = hidden_states.chunk(3, dim=2)
@@ -1195,7 +1195,7 @@ def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
 
     def prime_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        if lyric_encoder_states is not None: 
+        if lyric_encoder_states is not None:
             raise TypeError("lyric_encoder_states should be None")
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
@@ -1419,7 +1419,7 @@ def __init__(
         # Orders of attn_func
         attn_func = self.get_attn_func(attn_order)
 
-        def attn_block(d):
+        def attn_block(depth):
             return JukeboxBlock(
                 width=width,
                 n_ctx=n_ctx,
@@ -1429,12 +1429,12 @@ def attn_block(d):
                 afn=afn,
                 scale=scale,
                 mask=mask,
-                zero_out=zero_out if attn_func(d) != 6 else True,
+                zero_out=zero_out if attn_func(depth) != 6 else True,
                 init_scale=init_scale,
                 res_scale=res_scale,
                 m_attn=m_attn,
                 m_mlp=m_mlp,
-                attn_func=attn_func(d),
+                attn_func=attn_func(depth),
                 blocks=blocks,
                 spread=spread,
                 encoder_dims=encoder_dims,
@@ -1442,47 +1442,34 @@ def attn_block(d):
             )
 
         self._attn_mods = nn.ModuleList()
-        for d in range(n_depth):
-            self._attn_mods.append(attn_block(d))
+        for depth in range(n_depth):
+            self._attn_mods.append(attn_block(depth))
 
         self.saved_attn_weights = []
 
-
-    def get_attn_func(self, attn_order : int):
+    def get_attn_func(self, attn_order: int):
         """
-        Get the correct attention order pattern.  
+        Get the correct attention order pattern.
         """
-        
-        if attn_order == 0 : 
-            attn_func = lambda layer : 0
-        if attn_order == 1 : 
-            attn_func = lambda layer: [1, 2][layer % 2]
-        elif attn_order == 2 : 
-            attn_func = lambda layer: [1, 2, 3][layer % 3]  # Alternate row, column and previous row attn
-        elif attn_order == 3 : 
-            attn_func = lambda layer: [1, 4][layer % 2]  # Alternate row and last column
-        elif attn_order == 4 : 
-            attn_func = lambda layer: [1, 5][layer % 2]  # Alternate row and last k columns
-        elif attn_order == 5 : 
-            attn_func = lambda layer: [1, 4, 1, 1][layer % 4]  # Alternate row, last column, row, row
-        elif attn_order == 6 : 
-            attn_func = lambda layer: [1, 2, 3, 6][layer % 4]
-        elif attn_order == 7 : 
-            attn_func = lambda layer: [*[1, 2, 3] * 5, 6][layer % 16]
-        elif attn_order == 8 : 
-            attn_func = lambda layer: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][layer % 10]  # Used by separated_enc_dec model with lyrics
-        elif attn_order == 9 : 
-            attn_func = lambda layer: [1, 2, 3, 0][layer % 4]
-        elif attn_order == 10 : 
-            attn_func = lambda layer: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][
-                layer % 79
-            ]  # Used by large separated_enc_dec model with lyrics
-        elif attn_order == 11 : 
-            attn_func = lambda layer: [6, 6, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3]
-        elif attn_order == 12 : 
-            attn_func = lambda layer: [7, 7, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3]  # Used by single_enc_dec model with lyrics
-        
-        return attn_func
+        mapping = {
+            0: lambda layer: 0,
+            1: lambda layer: [1, 2][layer % 2],
+            2: lambda layer: [1, 2, 3][layer % 3],  # Alternate row, column and previous row attn
+            3: lambda layer: [1, 4][layer % 2],  # Alternate row and last column
+            4: lambda layer: [1, 5][layer % 2],  # Alternate row and last k columns
+            5: lambda layer: [1, 4, 1, 1][layer % 4],  # Alternate row, last column, row, row
+            6: lambda layer: [1, 2, 3, 6][layer % 4],
+            7: lambda layer: [*[1, 2, 3] * 5, 6][layer % 16],
+            8: lambda layer: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][layer % 10],  # Used by separated_enc_dec model with lyrics
+            9: lambda layer: [1, 2, 3, 0][layer % 4],
+            # Used by large separated_enc_dec model with lyrics
+            10: lambda layer: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][layer % 79],
+            11: lambda layer: [6, 6, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3],
+            # Used by single_enc_dec model with lyrics
+            12: lambda layer: [7, 7, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3],
+        }
+
+        return mapping[attn_order]
 
     def set_record_attn(self, record_attn):
         """
@@ -1502,8 +1489,6 @@ def _should_record_attn(layer_idx):
 
         if not record_attn:
             self.saved_attn_weights = []
-            for layer in self._attn_mods:
-                layer.attn.w = None
 
     def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
         # Blocks
@@ -1663,7 +1648,11 @@ def forward(
 
         batch_size = tokens.shape[0]
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((batch_size, 1, self.width), device=tokens.device, dtype=torch.float)
+            audio_conditioning = torch.zeros(
+                (batch_size, 1, self.width),
+                device=tokens.device,
+                dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype,
+            )
 
         target = tokens  # Target
         hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
@@ -1712,7 +1701,9 @@ def forward(
 
     def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
         if sample_t == 0:
-            hidden_states = torch.empty(n_samples, 1, self.width).to(audio_conditioning.device)
+            hidden_states = torch.empty(n_samples, 1, self.width, dtype=self.embed_tokens.weight.dtype).to(
+                audio_conditioning.device
+            )
             if self.metadata_conditioning:
                 hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width)
             else:
@@ -1744,9 +1735,9 @@ def sample(
             sample_tokens = self.input_dims
 
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((n_samples, 1, self.width), dtype=torch.float).to(
-                "cpu" if torch.cuda.is_available() else "cpu"
-            )
+            audio_conditioning = torch.zeros(
+                (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+            ).to("cpu" if torch.cuda.is_available() else "cpu")
 
         with torch.no_grad():
             sampled_tokens, tokens = [], None
@@ -1814,7 +1805,9 @@ def primed_sample(
         sampled_audio = list(sampled_audio)
 
         if not self.audio_conditioning:
-            audio_conditioning = torch.zeros((n_samples, 1, self.width), dtype=torch.float).to(hidden_states.device)
+            audio_conditioning = torch.zeros(
+                (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+            ).to(hidden_states.device)
 
         with torch.no_grad():
             if get_preds:
@@ -1976,9 +1969,9 @@ def forward(self, pos_start, pos_end=None):
         # Check if [pos_start,pos_end] in [pos_min, pos_max)
         if not len(pos_start.shape) == 2:
             raise TypeError(f"Expected shape with 2 dims, got {pos_start.shape}")
-        if not (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all() :
+        if not (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all():
             raise TypeError(f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}")
-        
+
         pos_start = pos_start.float()
         if pos_end is not None:
             if self.clamp:
@@ -2342,7 +2335,9 @@ def prior_preprocess(self, tokens, conds):
             cond, dims = conds[i], self.prior_dims[i]
             if cond is None:
                 conds[i] = torch.zeros(
-                    (batch_size, dims, self.prior_width), dtype=torch.float, device=tokens[0].device
+                    (batch_size, dims, self.prior_width),
+                    dtype=self.prior.transformer._attn_mods[0].mlp.c_fc.weight.dtype,
+                    device=tokens[0].device,
                 )
 
         return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index af61a6307064e..6ac8ed0be7d7f 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -333,3 +333,28 @@ def test_slow_sampling(self):
         set_seed(0)
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+
+    @slow
+    def test_fp16_slow_sampling(self):
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval().half().to("cuda").half()
+        labels = [i.cuda() for i in self.prepare_inputs(self.model_id)]
+
+        set_seed(0)
+        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+
+        set_seed(0)
+        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+
+        set_seed(0)
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+
+        # Accumulation of errors might give :
+        # [ 491, 1755,   34, 1613, 1755,  417,  992, 1613,  222,  842, 1353, 1613,
+        #  808,  616,   34, 1613,  808,  616,   34, 1613,  222,  616,  290,  842,
+        #  222,  616, 1372,  114, 1353,  114,  591,  842, 1353, 1613,  307, 1756,
+        # 1353,  114,  591, 1268,  591, 1613,   34, 1268,  591, 1613,   34, 1061,
+        #  591,  114,  185,   89,   34, 1613,  185,   89,  591,  632,  222,   89]

From 6c2291233bae57aa195d2075d35f88b74860262a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 13:17:03 +0000
Subject: [PATCH 139/196] clean fp16 test

---
 tests/models/jukebox/test_modeling_jukebox.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 6ac8ed0be7d7f..030dc528a48c9 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -343,18 +343,3 @@ def test_fp16_slow_sampling(self):
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
-
-        set_seed(0)
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
-        # Accumulation of errors might give :
-        # [ 491, 1755,   34, 1613, 1755,  417,  992, 1613,  222,  842, 1353, 1613,
-        #  808,  616,   34, 1613,  808,  616,   34, 1613,  222,  616,  290,  842,
-        #  222,  616, 1372,  114, 1353,  114,  591,  842, 1353, 1613,  307, 1756,
-        # 1353,  114,  591, 1268,  591, 1613,   34, 1268,  591, 1613,   34, 1061,
-        #  591,  114,  185,   89,   34, 1613,  185,   89,  591,  632,  222,   89]

From 34948de210465ffb43ca3e58c482762bb5b1e265 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 16:24:35 +0000
Subject: [PATCH 140/196] more cleaning

---
 .../models/jukebox/modeling_jukebox.py        | 82 +++++++------------
 1 file changed, 29 insertions(+), 53 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 92b627d051abc..4c1bffa5e0ce3 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -16,7 +16,6 @@
 
 import math
 import os
-import time
 from typing import List
 
 import numpy as np
@@ -178,9 +177,8 @@ def get_alignment(music_tokens, labels, prior, config):
     return alignments
 
 
-def save_wav(fname, lvl, metas, aud, sampling_rate):
+def save_temp_audio(fname, lvl, metas, aud, sampling_rate):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
-    return
     for i in list(range(aud.shape[0])):
         if metas is not None:
             artists, genres, lyrics = list(metas)[i].values()
@@ -1615,12 +1613,6 @@ def __init__(
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
 
-    def preprocess(self, tokens):
-        # Input: hidden_states is NHWC and uint8. Converted to NL and long
-        # Can include stuff like bitpacking, reordering here.
-        batch_size = tokens.shape[0]
-        return tokens.view(batch_size, -1).long()
-
     def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
         batch_size = tokens.shape[0]
@@ -1643,10 +1635,10 @@ def forward(
         - tokens : composed of both music tokens and lyrics tokens or just music tokens
         """
         # Preprocess.
+        batch_size = tokens.shape[0]
         with torch.no_grad():
-            tokens = self.preprocess(tokens)
+            tokens = tokens.view(batch_size, -1).long()
 
-        batch_size = tokens.shape[0]
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros(
                 (batch_size, 1, self.width),
@@ -1798,8 +1790,9 @@ def primed_sample(
         if sample_tokens is None:
             sample_tokens = self.input_dims
         # Preprocess.
+        batch_size = hidden_states.shape[0]
         with torch.no_grad():
-            hidden_states = self.preprocess(hidden_states)
+            hidden_states = hidden_states.view(batch_size, -1).long()
 
         sampled_audio = torch.split(hidden_states, 1, dim=1)
         sampled_audio = list(sampled_audio)
@@ -1872,12 +1865,12 @@ def primed_sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                hidden_states = torch.distributions.Categorical(
+                tokens = torch.distributions.Categorical(
                     logits=hidden_states
                 ).sample()  # Sample and replace hidden_states
-                sampled_audio.append(hidden_states.clone())
+                sampled_audio.append(tokens.clone())
 
-            del hidden_states
+            del tokens
             self.transformer.del_cache()
 
             hidden_states = torch.cat(sampled_audio, dim=1)
@@ -2309,14 +2302,12 @@ def get_music_tokens_conds(self, music_tokens, start, end):
         Extracts current level's conditioning music tokens.
         """
         if self.level != self.levels - 1:
-            music_tokens_cond = music_tokens[self.level + 1][
-                :, start // self.cond_downsample : end // self.cond_downsample
-            ]
+            music_tokens_cond = music_tokens[self.level + 1]
+            music_tokens = music_tokens[:, start // self.cond_downsample : end // self.cond_downsample]
             missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
             if missing_cond_len > 0:
-                music_tokens_cond = torch.cat(
-                    (music_tokens_cond, torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)), dim=-1
-                ).long()
+                init_cond = torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)
+                music_tokens_cond = torch.cat((music_tokens_cond, init_cond), dim=-1).long()
             music_tokens_conds = [music_tokens_cond]
         else:
             music_tokens_conds = None
@@ -2708,8 +2699,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
-        max_batch_size = sampling_kwargs["max_batch_size"]
-        del sampling_kwargs["max_batch_size"]
+        max_batch_size = sampling_kwargs.pop("max_batch_size")
 
         music_tokens_list = self.split_batch(previous_sampled_tokens, n_samples, max_batch_size)
         music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
@@ -2728,8 +2718,6 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             tokens.append(tokens_i)
         sampled_tokens = torch.cat(tokens, dim=0)
 
-        sampling_kwargs["max_batch_size"] = max_batch_size
-
         # Update music_tokens with new sample
         music_tokens_new = sampled_tokens[:, -new_tokens:]
         music_tokens[level] = torch.cat([music_tokens[level], music_tokens_new], dim=1)
@@ -2838,42 +2826,28 @@ def _sample(
                 int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens
             ) * top_prior.raw_to_tokens
 
-        sampling_kwargs = [
-            dict(
-                temp=0.99,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=sample_tokens,
-            ),
-            dict(
-                temp=0.99,
-                max_batch_size=lower_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=sample_tokens,
-            ),
-            dict(
-                temp=sampling_temperature,
-                max_batch_size=max_batch_size,
-                chunk_size=chunk_size,
-                sample_tokens=sample_tokens,
-            ),
-        ]
-        self.start_time = time.strftime("%Y-%m-%d-%Hh%M")
         if sample_levels is None:
             sample_levels = range(len(self.priors))
 
-        self.total_length = total_length  # total length of the signal, might be bit different
+        self.total_length = (
+            total_length  # total length of the signal, might be bit different from the actual generated length
+        )
         for level in reversed(sample_levels):
+            sampling_kwargs = dict(
+                temp=0.99 if level == 0 else sampling_temperature,
+                max_batch_size=lower_batch_size if level != sample_levels else max_batch_size,
+                chunk_size=chunk_size,
+                sample_tokens=sample_tokens,
+            )
 
-            # from the actual generated length
             self.priors[level].to(music_tokens[level].device).eval()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
-            # self.priors[level].total_length = total_length // self.priors[level].raw_to_tokens
+
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
             hop_length = int(self.config.hop_fraction[-level - 1] * self.priors[level].n_ctx)
 
             music_tokens = self.sample_level(
-                music_tokens, labels[level], offset, sampling_kwargs[level], level, total_token_to_sample, hop_length
+                music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length
             )
 
             self.priors[level].to("cpu")
@@ -2883,13 +2857,15 @@ def _sample(
                 raw_audio = self.vqvae.decode(
                     music_tokens[level:], start_level=level, bs_chunks=music_tokens[level].shape[0]
                 )
-            self.vqvae.to("cpu")
+            self.vqvae.to("cpu")  # save RAM
 
             if save_results:
-                logdir = f"{self.start_time}/level_{level}"
+                logdir = f"jukebox/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                save_wav(logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate)
+                save_temp_audio(
+                    logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate
+                )
                 if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():
                         alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], self.config)

From 067baabd72a8b2e9f00d5884265ea112a62f83dd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 16:43:28 +0000
Subject: [PATCH 141/196] update doc

---
 .../models/jukebox/modeling_jukebox.py        | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 4c1bffa5e0ce3..6487dace2a811 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2765,9 +2765,10 @@ def _sample(
            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
-            labels (`List[torch.Tensor]`):
-                Raw list of tokens. Should be the same length as `self.levels`, the number of priors or the length of
-                `sample_levels`.
+            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 +
+            self.config.max_nb_genre + lyric_sequence_lenght)` :
+                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
+                condition the generation.
             sample_levels (`List[int]`):
                 List of the desired levels at which the sampling will be done. A level is equivalent to the index of
                 the prior in the list of priors
@@ -2798,7 +2799,10 @@ def _sample(
             sample_length (`int`, *optional*, defaults to None):
                 Desired lenght of the generation in samples.
 
+        Returns:
+
         Example:
+
         ```python
         >>> from transformers import JukeboxTokenizer, JukeboxModel, set_seed
         >>> import torch
@@ -2879,8 +2883,13 @@ def _sample(
         upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
         the VQ-VAE decoder to convert the music tokens to raw audio.
 
-        Args:""",
-        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+        Args:
+            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + lyric_sequence_lenght)` :
+                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
+                condition the generation.
+            n_samples (`int`, *optional*, default to 1) :
+                Number of samples to be generated in parallel.
+        """,
     )
     def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch.LongTensor]:
         """
@@ -2934,7 +2943,6 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> List[torch
             music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
-
         """,
         JUKEBOX_SAMPLING_INPUT_DOCSTRING,
     )

From da6e27eff9b3111101851f4afe02cac476bf1651 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 21 Oct 2022 17:14:49 +0000
Subject: [PATCH 142/196] update doctests

---
 .../models/jukebox/modeling_jukebox.py        | 38 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  7 ++--
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6487dace2a811..5c5a1a8aadbc1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -863,6 +863,7 @@ def forward(self, raw_audio):
         >>> zs = [torch.randint(100, (4, 1))]
         >>> model.decode(zs).shape
         torch.Size([4, 8, 1])
+
         ```"""
 
         # Encode/Decode
@@ -1776,7 +1777,7 @@ def split_chunks(self, length, chunk_size):
     def primed_sample(
         self,
         n_samples,
-        hidden_states,
+        music_tokens,
         audio_conditioning=None,
         metadata_conditioning=None,
         lyric_encoder_states=None,
@@ -1790,17 +1791,17 @@ def primed_sample(
         if sample_tokens is None:
             sample_tokens = self.input_dims
         # Preprocess.
-        batch_size = hidden_states.shape[0]
+        batch_size = music_tokens.shape[0]
         with torch.no_grad():
-            hidden_states = hidden_states.view(batch_size, -1).long()
+            music_tokens = music_tokens.view(batch_size, -1).long()
 
-        sampled_audio = torch.split(hidden_states, 1, dim=1)
+        sampled_audio = torch.split(music_tokens, 1, dim=1)
         sampled_audio = list(sampled_audio)
 
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros(
                 (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
-            ).to(hidden_states.device)
+            ).to(music_tokens.device)
 
         with torch.no_grad():
             if get_preds:
@@ -1813,15 +1814,15 @@ def primed_sample(
             chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
             x_primes = []
             start = 0
-            hidden_states = None
+            music_tokens = None
 
             for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     x_prime, cond_prime = self.get_emb(
-                        sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
+                        sample_t, n_samples, music_tokens, audio_conditioning, metadata_conditioning
                     )
-                    hidden_states = sampled_audio[sample_t]
+                    music_tokens = sampled_audio[sample_t]
                     sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
                 start = start + current_chunk_size
@@ -1845,13 +1846,13 @@ def primed_sample(
                 x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
-            hidden_states = sampled_audio[-1]
+            music_tokens = sampled_audio[-1]
 
             iter = tqdm(range(len(sampled_audio), sample_tokens))
             for sample_t in iter:
                 iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
-                    sample_t, n_samples, hidden_states, audio_conditioning, metadata_conditioning
+                    sample_t, n_samples, music_tokens, audio_conditioning, metadata_conditioning
                 )
 
                 hidden_states = self.transformer(
@@ -1865,22 +1866,22 @@ def primed_sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                tokens = torch.distributions.Categorical(
+                music_tokens = torch.distributions.Categorical(
                     logits=hidden_states
                 ).sample()  # Sample and replace hidden_states
-                sampled_audio.append(tokens.clone())
+                sampled_audio.append(music_tokens.clone())
 
-            del tokens
+            del music_tokens
             self.transformer.del_cache()
 
-            hidden_states = torch.cat(sampled_audio, dim=1)
+            music_tokens = torch.cat(sampled_audio, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            hidden_states = self.postprocess(hidden_states, sample_tokens)
+            music_tokens = self.postprocess(music_tokens, sample_tokens)
         if get_preds:
-            return hidden_states, preds
+            return music_tokens, preds
         else:
-            return hidden_states
+            return music_tokens
 
 
 class JukeboxMusicTokenConditioner(nn.Module):
@@ -2303,7 +2304,7 @@ def get_music_tokens_conds(self, music_tokens, start, end):
         """
         if self.level != self.levels - 1:
             music_tokens_cond = music_tokens[self.level + 1]
-            music_tokens = music_tokens[:, start // self.cond_downsample : end // self.cond_downsample]
+            music_tokens = music_tokens_cond[:, start // self.cond_downsample : end // self.cond_downsample]
             missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
             if missing_cond_len > 0:
                 init_cond = torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)
@@ -2820,6 +2821,7 @@ def _sample(
               353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
              1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
              1804,  541, 1804, 1434]])
+
         ```"""
 
         top_prior = self.priors[-1]
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 0e33272179469..5fc749c3de6b2 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -72,10 +72,9 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     >>> from transformers import JukeboxTokenizer
     >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
     >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
-    [tensor([[  0,   0,   0, 145,   0]]),
-     tensor([[  0,   0,   0, 145,   0]]),
-     tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
-                40,   76,   44,   41,   27,   30]])]
+    [tensor([[  0,   0,   0, 145,   0]]), tensor([[  0,   0,   0, 145,   0]]), tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
+               40,   76,   44,   41,   27,   30]])]
+
     ```
 
     You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you

From 2b76a6f57daf68ad5427db1e2c71a8ed738d6bf1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 24 Oct 2022 07:32:32 +0000
Subject: [PATCH 143/196] quality

---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 5c5a1a8aadbc1..6f5b97e105acf 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -863,7 +863,6 @@ def forward(self, raw_audio):
         >>> zs = [torch.randint(100, (4, 1))]
         >>> model.decode(zs).shape
         torch.Size([4, 8, 1])
-
         ```"""
 
         # Encode/Decode
@@ -2821,7 +2820,6 @@ def _sample(
               353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
              1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
              1804,  541, 1804, 1434]])
-
         ```"""
 
         top_prior = self.priors[-1]

From 693e2c17cdd180e64371a7d519a7f3fc25f7e4e8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 24 Oct 2022 07:40:51 +0000
Subject: [PATCH 144/196] fix returns in doc

---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6f5b97e105acf..2a138a28757db 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2799,7 +2799,7 @@ def _sample(
             sample_length (`int`, *optional*, defaults to None):
                 Desired lenght of the generation in samples.
 
-        Returns:
+        Returns: torch.Tensor
 
         Example:
 

From 27bfc841c0128a05a729747012896fa393879a8f Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 26 Oct 2022 14:27:35 +0200
Subject: [PATCH 145/196] Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 .../models/jukebox/configuration_jukebox.py          | 12 ++++++------
 src/transformers/models/jukebox/modeling_jukebox.py  |  2 +-
 .../models/jukebox/tokenization_jukebox.py           |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 51253474c22f7..5e5bef00451cf 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -43,7 +43,7 @@ class JukeboxConfig(PretrainedConfig):
     Args:
         sampling_rate (`int`, *optional*, defaults to 44100):
             Sampling rate of the raw audio.
-        metadata_dims (`list`, *optional*, defaults to [(604, 7898), (120, 4111), (120, 4111)]):
+        metadata_dims (`List[Tuple[int, int]]`, *optional*, defaults to `[(604, 7898), (120, 4111), (120, 4111)]`):
             List containing the number of genres and the number of artists that were used to train the embedding layers
             of each of the prior models.
         nb_priors (`int`, *optional*, defaults to 3):
@@ -54,10 +54,10 @@ class JukeboxConfig(PretrainedConfig):
             Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
             layer. The timing embedding layer converts the absolute and relative position in the currently sampled
             audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
-        single_enc_dec (`list`, *optional*, defaults to [True, False, False]):
+        single_enc_dec (`List[bool]`, *optional*, defaults to `[True, False, False]`):
             Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
             `lyric_encoder` for each of the priors.
-        metadata_conditioning (`bool`, *optional*, defaults to True):
+        metadata_conditioning (`bool`, *optional*, defaults to `True`):
             Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
             duration.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
@@ -95,7 +95,7 @@ class JukeboxConfig(PretrainedConfig):
             Downsampling rates used in the audio conditioning network
         cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
             Striding used in the audio conditioning network
-        lyric_enc_spread (`bool`, *optional*, defaults to False):
+        lyric_enc_spread (`bool`, *optional*, defaults to `False`):
             Spread used in the attention pattern
         lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
             Width of the lyric encoder
@@ -123,9 +123,9 @@ class JukeboxConfig(PretrainedConfig):
             Residual dropout used in the attention pattern of the lyric encoder.
         lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
             Embedding dropout used in the lyric encoder.
-        lyric_enc_zero_out (`bool`, *optional*, defaults to False):
+        lyric_enc_zero_out (`bool`, *optional*, defaults to `False`):
             Whether or not to set to zeros the weights the MLPs in the lyric encoder.
-        lyric_enc_res_scale (`bool`, *optional*, defaults to False):
+        lyric_enc_res_scale (`bool`, *optional*, defaults to `False`):
             Residual scaling factor used in the lyric encoder attention patterns.
         lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
             Defines the number of different tokens that can be represented by the `inputs_ids` passed to the
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2a138a28757db..112cd1afc8a6a 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -216,7 +216,7 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
 
 class JukeboxConv1D(nn.Module):
     def __init__(self, n_in, n_out, zero_out=False):
-        super(JukeboxConv1D, self).__init__()
+        super().__init__()
         self.n_in = n_in
         self.n_out = n_out
         if zero_out:
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 5fc749c3de6b2..49ab4669f01d5 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -99,14 +99,14 @@ class JukeboxTokenizer(PreTrainedTokenizer):
             Path to the vocabulary file which contain a mapping between genres and ids.
         lyrics_file (`str`):
             Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
-        version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) :
+        version (`List[str]`, `optional`, default to `["v3", "v2", "v2"]`) :
             List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
             `v2`.
         n_genres (`int`, `optional`, defaults to 1):
             Maximum number of genres to use for composition.
         max_n_lyric_tokens (`int`, `optional`, defaults to 512):
             Maximum number of lyric tokens to keep.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
     """

From ef3bd92426b96b99b7bfc0d5eb75890dfff0e39b Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 26 Oct 2022 15:16:42 +0200
Subject: [PATCH 146/196] Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8c507804f68bc..de6182fc78a18 100644
--- a/README.md
+++ b/README.md
@@ -317,7 +317,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.

From fa2556c048b4fbfe1abc55796f26f898437550c1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 2 Nov 2022 20:58:44 +0000
Subject: [PATCH 147/196] HUGE refactoring of the code

---
 README.md                                     |    2 +-
 .../models/jukebox/configuration_jukebox.py   |  734 ++++++---
 .../models/jukebox/convert_jukebox.py         |   42 +-
 .../models/jukebox/modeling_jukebox.py        | 1305 +++++++----------
 .../models/jukebox/tokenization_jukebox.py    |    4 +-
 5 files changed, 1045 insertions(+), 1042 deletions(-)

diff --git a/README.md b/README.md
index 8c507804f68bc..de6182fc78a18 100644
--- a/README.md
+++ b/README.md
@@ -317,7 +317,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 51253474c22f7..b56c5b60f7958 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -14,6 +14,10 @@
 # limitations under the License.
 """ Jukebox configuration"""
 
+import copy
+import os
+from typing import List, Union
+
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
 
@@ -25,182 +29,519 @@
     "openai/jukebox-1b-lyrics": "https://huggingface.co/openai/jukebox-1b-lyrics/blob/main/config.json",
 }
 
+_LARGE_ATTENTION = [
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+]
+_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
+_FullDenseAttention = ["dense_attention"]
+_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
 
-class JukeboxConfig(PretrainedConfig):
+ATTENTION_PATTERNS = {
+    "FullDenseAttention": lambda layer: _FullDenseAttention[0],
+    "RawColumnPreviousRowAttention": lambda layer: _RawColumnPreviousRowAttention[
+        layer % 3
+    ],  # Alternate row, column and previous row attn
+    "large_separated_enc_dec_w_lyrics": lambda layer: _LARGE_ATTENTION[
+        layer % 79
+    ],  # Used by large separated_enc_dec model with lyrics
+    "single_enc_dec_w_lyrics": lambda layer: _PrimePrimeDenseAttention[layer % 3]
+    if layer % 16 == 15
+    else _RawColumnPreviousRowAttention[layer % 3],  # Used by single_enc_dec model with lyrics
+}
+
+
+class JukeboxPriorConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`JukeboxModel`].
+    This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
+    `JukeboxPriorl` according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the top level prior fro the
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/ukebox-1b-lyrics) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
-    yield a similar configuration to that of
-    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+    documentation from [`PretrainedConfig`] for more information.
 
 
-    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
-    (5,3), and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
-    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
-        sampling_rate (`int`, *optional*, defaults to 44100):
-            Sampling rate of the raw audio.
-        metadata_dims (`list`, *optional*, defaults to [(604, 7898), (120, 4111), (120, 4111)]):
+        metadata_dims (`List[Tuple[int, int]]`, *optional*, defaults to `[(604, 7898), (120, 4111), (120, 4111)]`):
             List containing the number of genres and the number of artists that were used to train the embedding layers
             of each of the prior models.
-        nb_priors (`int`, *optional*, defaults to 3):
-            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
-            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
-            trained using a top prior and 2 upsampler priors.
-        timing_dims (`int`, *optional*, defaults to 64):
-            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
-            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
-            audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
-        single_enc_dec (`list`, *optional*, defaults to [True, False, False]):
+        single_enc_dec (`List[bool]`, *optional*, defaults to `[True, False, False]`):
             Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
-            `lyric_encoder` for each of the priors.
-        metadata_conditioning (`bool`, *optional*, defaults to True):
-            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
-            duration.
+            `encoderoder` for each of the priors.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
             Whether or not the decoder is merged with the encoder.
         lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
             Whether or not to use the lyrics as conditioning.
         nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
             Number of tokens that are used when sampling a single window of length `prior_n_ctx`
-        min_duration (`float`, *optional*, defaults to 17.84):
-            Minimum duration of the audios to generate
-        max_duration (`float`, *optional*, defaults to 600.0):
-            Maximum duration of the audios to generate
-        max_nb_genres (`int`, *optional*, defaults to 5):
-            Maximum number of genres that can be used to condition a single sample.
-        init_std (`float`, *optional*, defaults to 0.2):
-            Standard deviation used to inital the model.
-        hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
-            Fraction of non-intersecting window used when continuing the sampling process.
-        cond_zero_out (`bool`, *optional*, defaults to False):
+        zero_out (`bool`, *optional*, defaults to False):
             Zero out weights when initialising.
-        cond_depth (`list`, *optional*, defaults to [3, 16, 16]):
+        depth (`list`, *optional*, defaults to [3, 16, 16]):
             Number of layers to use for the music conditioner.
-        cond_width (`list`, *optional*, defaults to [128, 1024, 1024]):
+        width (`list`, *optional*, defaults to [128, 1024, 1024]):
             Width of the audio conditioning layer.
-        cond_dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
+        dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
             Dilation grow rate used between each convolutionnal block.
-        cond_dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
+        dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
             Cycle of dilation to use. Usually similar to the ones used in the VQVAE.
-        cond_res_scale (`list`, *optional*, defaults to [None, True, False]):
+        res_scale (`list`, *optional*, defaults to [None, True, False]):
             Wheter or not to scale the residuals in the audio conditionner block. Since the top level prior doeas not
             have a conditionner, the default value is to None and should not be modified.
-        cond_m_conv (`int`, *optional*, defaults to 1):
+        convolution_multiplier (`int`, *optional*, defaults to 1):
             Conditionner multiplier (the input states are mulitplied by that parameter for each convolution.
-        cond_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
+        downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
             Downsampling rates used in the audio conditioning network
-        cond_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
+        strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
             Striding used in the audio conditioning network
-        lyric_enc_spread (`bool`, *optional*, defaults to False):
+        encoder_spread (`bool`, *optional*, defaults to `False`):
             Spread used in the attention pattern
-        lyric_enc_width (`list`, *optional*, defaults to [128, 128, 128]):
+        encoder_width (`list`, *optional*, defaults to [128, 128, 128]):
             Width of the lyric encoder
-        lyric_enc_depth (`list`, *optional*, defaults to [18, 3, 3]):
+        encoder_depth (`list`, *optional*, defaults to [18, 3, 3]):
             Number of encoder blocks used in the lyric encoder
-        lyric_enc_heads (`int`, *optional*, defaults to 4):
+        encoder_heads (`int`, *optional*, defaults to 4):
             Number of heads in the lyric encoder
-        lyric_enc_m_attn (`float`, *optional*, defaults to 0.25):
+        encoder_attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
-        lyric_enc_m_mlp (`float`, *optional*, defaults to 1.0):
+        encoder_mlp_multiplier (`float`, *optional*, defaults to 1.0):
             Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
             the model will be used.
-        lyric_enc_blocks (`int`, *optional*, defaults to 32):
+        encoder_blocks (`int`, *optional*, defaults to 32):
             Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
-        lyric_enc_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
+        encoder_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
             Initialisation scales for the lyric encoder modules.
-        lyric_enc_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
+        encoder_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
             Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
-        lyric_enc_attn_order (`list`, *optional*, defaults to [2, 0, 0]):
+        encoder_attention_pattern (`list`, *optional*, defaults to [2, 0, 0]):
             Which attention pattern to use for the lyric encoder.
-        lyric_enc_attn_dropout (`float`, *optional*, defaults to 0.0):
+        encoder_attn_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability for the post-attention layer dropout in the lyric encoder.
-        lyric_enc_resid_dropout (`float`, *optional*, defaults to 0.0):
+        encoder_resid_dropout (`float`, *optional*, defaults to 0.0):
             Residual dropout used in the attention pattern of the lyric encoder.
-        lyric_enc_emb_dropout (`float`, *optional*, defaults to 0.0):
+        encoder_emb_dropout (`float`, *optional*, defaults to 0.0):
             Embedding dropout used in the lyric encoder.
-        lyric_enc_zero_out (`bool`, *optional*, defaults to False):
+        encoder_zero_out (`bool`, *optional*, defaults to `False`):
             Whether or not to set to zeros the weights the MLPs in the lyric encoder.
-        lyric_enc_res_scale (`bool`, *optional*, defaults to False):
+        encoder_res_scale (`bool`, *optional*, defaults to `False`):
             Residual scaling factor used in the lyric encoder attention patterns.
-        lyric_enc_n_vocab (`int`, *optional*, defaults to 79):
+        encoder_n_vocab (`int`, *optional*, defaults to 79):
             Defines the number of different tokens that can be represented by the `inputs_ids` passed to the
-            `lyric_encoder`
-        prior_init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
+            `encoderoder`
+        init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
             Initialisation scales for the prior modules.
-        prior_spread (`bool`, *optional*, defaults to False):
+        spread (`bool`, *optional*, defaults to False):
             Spread used in the attention pattern
-        prior_zero_out (`bool`, *optional*, defaults to False):
+        zero_out (`bool`, *optional*, defaults to False):
              Whether or not to set to zeros the weights the MLPs of the priors.
-        prior_res_scale (`bool`, *optional*, defaults to False):
+        res_scale (`bool`, *optional*, defaults to False):
             Residual scaling factor used in every prior's attention layer.
-        prior_n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
+        n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
             Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
             generating music tokens.
-        prior_latent_dim (`int`, *optional*, defaults to 2048):
+        latent_dim (`int`, *optional*, defaults to 2048):
             Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`.
-        prior_width (`list`, *optional*, defaults to [2048, 1920, 1920]):
+        width (`list`, *optional*, defaults to [2048, 1920, 1920]):
             Input and output dimension of the attention layers of each prior.
-        prior_m_attn (`float`, *optional*, defaults to 0.25):
+        attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
-            0.25*prior_width of the model will be used.
-        prior_depth (`list`, *optional*, defaults to [72, 72, 72]):
+            0.25*width of the model will be used.
+        depth (`list`, *optional*, defaults to [72, 72, 72]):
             Depth of each prior. Defines the number of `attn_block`.
-        prior_n_heads (`list`, *optional*, defaults to [2, 1, 1]):
+        n_heads (`list`, *optional*, defaults to [2, 1, 1]):
             Number of attention heads per prior.
-        prior_attn_order (`list`, *optional*, defaults to [12, 2, 2]):
+        attention_pattern (`list`, *optional*, defaults to [12, 2, 2]):
             Attention patterns to use in each prior. Depending on the value, cross attention, block attention and
             sparse attention blocks are stacked.
-        prior_blocks (`int`, *optional*, defaults to 64):
+        blocks (`int`, *optional*, defaults to 64):
             Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
-        prior_alignment_layer (`list`, *optional*, defaults to [68, None, None]):
+        alignment_layer (`list`, *optional*, defaults to [68, None, None]):
             Layer corresponding to the alignemnt between the lyrics and the audio.
-        prior_alignment_head (`list`, *optional*, defaults to [2, None, None]):
+        alignment_head (`list`, *optional*, defaults to [2, None, None]):
             Index of the attention head which takes care of the alignemnt between the lyrics and the audio.
-        prior_attn_dropout (`int`, *optional*, defaults to 0):
+        attn_dropout (`int`, *optional*, defaults to 0):
             Dropout probability for the post-attention layer dropout of the prior models.
-        prior_resid_dropout (`int`, *optional*, defaults to 0):
+        resid_dropout (`int`, *optional*, defaults to 0):
             Residual dropout probability used in the attention layers of the prior models.
-        prior_emb_dropout (`int`, *optional*, defaults to 0):
+        emb_dropout (`int`, *optional*, defaults to 0):
             Dropout applied to the embedding layer of the priors.
-        vqvae_levels (`int`, *optional*, defaults to 3):
+    """
+
+    model_type = "jukebox"
+    attribute_map = {
+        "hidden_size": "vqvae_codebook_dimension",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+    }
+
+    def __init__(
+        self,
+        sampling_rate = 44100,
+        timing_dims = 64,
+        min_duration = 0,
+        max_duration = 600,
+        max_nb_genres =  1,
+        metadata_conditioning = True,
+        zero_out=False,
+        res_conv_depth=3,
+        res_conv_width=128,
+        res_dilation_growth_rate=1,
+        res_dilation_cycle=None,
+        res_scale=None,
+        res_convolution_multiplier=1,
+        res_downs_t=(3, 2, 2),
+        res_strides_t=(2, 2, 2),
+        encoder_spread=None,
+        encoder_width=128,
+        encoder_depth=18,
+        encoder_heads=4,
+        encoder_attention_multiplier=0.25,
+        encoder_mlp_multiplier=1.0,
+        encoder_blocks=32,
+        encoder_init_scale=0.1,
+        encoder_loss_fraction=[0.4, 0.0, 0.0],
+        encoder_attention_pattern="RawColumnPreviousRowAttention",
+        encoder_attn_dropout=0.0,
+        encoder_resid_dropout=0.0,
+        encoder_emb_dropout=0.0,
+        encoder_zero_out=False,
+        encoder_res_scale=False,
+        encoder_n_vocab=79,
+        init_scale=0.2,
+        n_ctx=6144,
+        width=2048,
+        depth=72,
+        n_heads=2,
+        attention_pattern="single_enc_dec_w_lyrics",
+        alignment_layer=68,
+        alignment_head=2,
+        metadata_dims=(604, 7898),
+        single_enc_dec=True,
+        merged_decoder=True,
+        lyric_conditioning=True,
+        nb_relevant_lyric_tokens=384,
+        embed_dim=2048,
+        spread=None,
+        blocks=64,
+        attention_multiplier=0.25,
+        mlp_multiplier=1.0,
+        attn_dropout=0,
+        resid_dropout=0,
+        emb_dropout=0,
+        mask=False,
+        act_fn="quick_gelu",
+        **kwargs
+    ):
+        self.metadata_dims = metadata_dims
+        self.res_conv_depth = res_conv_depth
+        self.res_conv_width = res_conv_width
+        #  Auto regressive (decoder) kwargs :
+        self.attention_pattern = attention_pattern
+        self.n_heads = n_heads
+        self.depth = depth
+        self.width = width
+        self.n_ctx = n_ctx
+        self.embed_dim = embed_dim
+        self.attn_dropout = attn_dropout
+        self.resid_dropout = resid_dropout
+        self.emb_dropout = emb_dropout
+        self.zero_out = zero_out
+        self.res_scale = res_scale
+        self.blocks = blocks
+        self.attention_multiplier = attention_multiplier
+        self.mlp_multiplier = mlp_multiplier
+        self.spread = spread
+        self.alignment_layer = alignment_layer
+        self.alignment_head = alignment_head
+        self.init_scale = init_scale
+
+        # Audio conditioning : upsampler parameters
+        self.depth = depth
+        self.width = width
+        self.res_dilation_growth_rate = res_dilation_growth_rate
+        self.res_dilation_cycle = res_dilation_cycle
+        self.zero_out = zero_out
+        self.res_convolution_multiplier = res_convolution_multiplier
+        self.res_scale = res_scale
+        self.res_downs_t = res_downs_t
+        self.res_strides_t = res_strides_t
+
+        # Lyric conditioning
+        self.merged_decoder = merged_decoder  # is this equivalent ?
+        self.single_enc_dec = single_enc_dec
+        self.lyric_conditioning = lyric_conditioning
+        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
+
+        self.encoder_attn_dropout = encoder_attn_dropout
+        self.encoder_attention_pattern = encoder_attention_pattern
+        self.encoder_blocks = encoder_blocks
+        self.encoder_depth = encoder_depth
+        self.encoder_emb_dropout = encoder_emb_dropout
+        self.encoder_heads = encoder_heads
+        self.encoder_init_scale = encoder_init_scale
+        self.encoder_loss_fraction = encoder_loss_fraction
+        self.encoder_attention_multiplier = encoder_attention_multiplier
+        self.encoder_mlp_multiplier = encoder_mlp_multiplier
+        self.encoder_resid_dropout = encoder_resid_dropout
+        self.encoder_res_scale = encoder_res_scale
+        self.encoder_spread = encoder_spread
+        self.encoder_width = encoder_width
+        self.encoder_zero_out = encoder_zero_out
+        self.encoder_n_vocab = encoder_n_vocab
+        self.mask = mask
+        self.act_fn = act_fn
+
+        self.sampling_rate = sampling_rate
+        self.timing_dims = timing_dims
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.max_nb_genres =  max_nb_genres
+        self.metadata_conditioning = metadata_conditioning
+
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        # get the text config dict if we are loading from CLIPConfig
+        if config_dict.get("model_type") == "jukebox_prior":
+            config_dict = config_dict["prior_configs"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class JukeboxVQVAEConfig(PretrainedConfig):
+    """
+        hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
+            Fraction of non-intersecting window used when continuing the sampling process.
+        input_channels:
+            number of audio channels
+        sample_length:
+            on which the VQVAE was trained. Provides the max output shape of the VQVAE
+        levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
-        vqvae_downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
+        downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
             Downsampling rate for each level of the hierachical VQ-VAE.
-        vqvae_strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
+        strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
             Stride used for each level of the hierachical VQ-VAE.
-        vqvae_emmbedding_width (`int`, *optional*, defaults to 64):
+        embed_dim (`int`, *optional*, defaults to 64):
             Dimension of the codebook vectors.
-        vqvae_codebook_dimension (`int`, *optional*, defaults to 2048):
+        codebook_dimension (`int`, *optional*, defaults to 2048):
             Number of codes to use in each of the VQVAE.
-        vqvae_m_conv (`int`, *optional*, defaults to 1):
+        convolution_multiplier (`int`, *optional*, defaults to 1):
             Projection factor used in the `JukeboxResConv1DBlock`.
-        vqvae_dilation_growth_rate (`int`, *optional*, defaults to 3):
+        dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
-        vqvae_dilation_cycle (`int`, *optional*, defaults to None):
+        dilation_cycle (`int`, *optional*, defaults to None):
             Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
-            of reduced by a power of `vqvae_dilation_cycle`.
-        vqvae_multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
-            Depth and width multipliers used for each level. Used on the `vqvae_conv_block_width` and
-            `vqvae_conv_block_depth`
-        vqvae_lmu (`float`, *optional*, defaults to 0.99):
+            of reduced by a power of `dilation_cycle`.
+        multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
+            Depth and width multipliers used for each level. Used on the `conv_block_width` and `conv_block_depth`
+        lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
-        vqvae_commit (`float`, *optional*, defaults to 0.02):
+        commit (`float`, *optional*, defaults to 0.02):
             Commit loss multiplier.
-        vqvae_conv_block_depth (`int`, *optional*, defaults to 4):
-            Depth of the encoder and decoder block. If no `vqvae_multipliers` are used, this is the same for each
-            level.
-        vqvae_conv_block_width (`int`, *optional*, defaults to 32):
-            Width of the encoder and decoder block. If no `vqvae_multipliers` are used, this is the same for each
-            level.
-        vqvae_reverse_decoder_dilation (`int`, *optional*, defaults to 1):
+        conv_block_depth (`int`, *optional*, defaults to 4):
+            Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+        conv_block_width (`int`, *optional*, defaults to 32):
+            Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+        reverse_decoder_dilation (`int`, *optional*, defaults to 1):
             Whether or not to reverse the dilation rate for the decoder.
     Example:
+    """
+
+    def __init__(
+        self,
+        hop_fraction=[0.125, 0.5, 0.5],
+        sample_length=1058304,
+        levels=3,
+        embed_dim=64,
+        codebook_dimension=2048,
+        lmu=0.99,
+        commit=0.02,
+        conv_input_shape=1,
+        res_downs_t=(3, 2, 2),
+        res_strides_t=(2, 2, 2),
+        multipliers=(2, 1, 1),
+        res_conv_width=32,
+        res_conv_depth=4,
+        res_convolution_multiplier=1,
+        res_dilation_growth_rate=3,
+        res_dilation_cycle=None,
+        res_scale=False,
+        act_fn="relu",
+        **kwargs
+    ):
+        self.hop_fraction = hop_fraction
+        self.conv_input_shape = conv_input_shape
+        self.sample_length = sample_length
+
+        # VQVAE parameters (all used)
+        self.levels = levels
+        self.embed_dim = embed_dim
+        self.codebook_dimension = codebook_dimension
+        self.res_conv_width = res_conv_width
+        self.res_conv_depth = res_conv_depth
+        self.res_convolution_multiplier = res_convolution_multiplier
+        self.res_dilation_growth_rate = res_dilation_growth_rate
+        self.res_dilation_cycle = res_dilation_cycle
+        self.multipliers = multipliers
+        self.res_downs_t = res_downs_t
+        self.res_strides_t = res_strides_t
+        self.lmu = lmu
+        self.commit = commit
+        self.res_scale = res_scale
+        self.act_fn = act_fn
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        # get the text config dict if we are loading from CLIPConfig
+        if config_dict.get("model_type") == "jukebox":
+            config_dict = config_dict["vqvae_config"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+class JukeboxConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`JukeboxModel`].
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
+    yield a similar configuration to that of
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+
+    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
+    (5,3), and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
+    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
+
+    Args:
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            Sampling rate of the raw audio.
+        nb_priors (`int`, *optional*, defaults to 3):
+            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
+            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
+            trained using a top prior and 2 upsampler priors.
+        timing_dims (`int`, *optional*, defaults to 64):
+            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
+            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
+            audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
+        metadata_conditioning (`bool`, *optional*, defaults to `True`):
+            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
+            duration.
+        single_enc_dec (`List[bool]`, *optional*, defaults to `[True, False, False]`):
+            Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
+            `encoderoder` for each of the priors.
+        merged_decoder (`list`, *optional*, defaults to [True, False, False]):
+            Whether or not the encoders are merged. This means that the input of.
+        lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
+            Whether or not to use the lyrics as conditioning.
+        nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
+            Number of tokens that are used when sampling a single window of length `n_ctx`
+        min_duration (`float`, *optional*, defaults to 17.84):
+            Minimum duration of the audios to generate
+        max_duration (`float`, *optional*, defaults to 600.0):
+            Maximum duration of the audios to generate
+        max_nb_genres (`int`, *optional*, defaults to 5):
+            Maximum number of genres that can be used to condition a single sample.
+        init_std (`float`, *optional*, defaults to 0.2):
+            Standard deviation used to inital the model.
 
     ```python
     >>> from transformers import JukeboxModel, JukeboxConfig
@@ -218,169 +559,86 @@ class JukeboxConfig(PretrainedConfig):
 
     model_type = "jukebox"
     attribute_map = {
-        "hidden_size": "vqvae_codebook_dimension",
+        "hidden_size": "codebook_dimension",
         "max_position_embeddings": "n_positions",
         "num_attention_heads": "n_head",
     }
+    is_composition = True
 
     def __init__(
         self,
-        sampling_rate=44100,
-        metadata_dims=[(604, 7898), (120, 4111), (120, 4111)],
+        vqvae_config=None,
+        prior_config_list=None,
         nb_priors=3,
+        sampling_rate=44100,
         timing_dims=64,
-        single_enc_dec=[True, False, False],
-        metadata_conditioning=True,
-        merged_decoder=[True, False, False],
-        lyric_conditioning=[True, False, False],
-        nb_relevant_lyric_tokens=[384, 0, 0],
-        min_duration=17.84,
+        min_duration=0,
         max_duration=600.0,
         max_nb_genres=5,
+        metadata_conditioning = True,
         init_std=0.2,
-        hop_fraction=[0.125, 0.5, 0.5],
-        cond_zero_out=False,
-        cond_depth=[3, 16, 16],
-        cond_width=[128, 1024, 1024],
-        cond_dilation_growth_rate=[1, 3, 3],
-        cond_dilation_cycle=[None, 8, 8],
-        cond_res_scale=[None, True, False],
-        cond_m_conv=1,
-        cond_downs_t=(3, 2, 2),
-        cond_strides_t=(2, 2, 2),
-        lyric_enc_spread=None,
-        lyric_enc_width=[128, 128, 128],
-        lyric_enc_depth=[18, 3, 3],
-        lyric_enc_heads=4,
-        lyric_enc_m_attn=0.25,
-        lyric_enc_m_mlp=1.0,
-        lyric_enc_blocks=32,
-        lyric_enc_init_scale=[0.1, 0.4, 0.4],
-        lyric_enc_loss_fraction=[0.4, 0.0, 0.0],
-        lyric_enc_attn_order=[2, 0, 0],
-        lyric_enc_attn_dropout=0.0,
-        lyric_enc_resid_dropout=0.0,
-        lyric_enc_emb_dropout=0.0,
-        lyric_enc_zero_out=False,
-        lyric_enc_res_scale=False,
-        lyric_enc_n_vocab=79,
-        prior_init_scale=[0.2, 1, 1],
-        prior_spread=None,
-        prior_zero_out=False,
-        prior_res_scale=False,
-        prior_n_ctx=(6144, 8192, 8192),
-        prior_latent_dim=2048,
-        prior_width=[2048, 1920, 1920],
-        prior_depth=[72, 72, 72],
-        prior_n_heads=[2, 1, 1],
-        prior_attn_order=[12, 2, 2],
-        prior_blocks=64,
-        prior_alignment_layer=[68, None, None],
-        prior_alignment_head=[2, None, None],
-        prior_m_attn=0.25,
-        prior_attn_dropout=0,
-        prior_resid_dropout=0,
-        prior_emb_dropout=0,
-        vqvae_levels=3,
-        vqvae_downs_t=(3, 2, 2),
-        vqvae_strides_t=(2, 2, 2),
-        vqvae_emmbedding_width=64,
-        vqvae_codebook_dimension=2048,
-        vqvae_width=32,
-        vqvae_depth=4,
-        vqvae_m_conv=1,
-        vqvae_dilation_growth_rate=3,
-        vqvae_dilation_cycle=None,
-        vqvae_multipliers=(2, 1, 1),
-        vqvae_lmu=0.99,
-        vqvae_commit=0.02,
-        vqvae_conv_block_depth=4,
-        vqvae_conv_block_width=32,
-        vqvae_reverse_decoder_dilation=1,
         **kwargs,
     ):
-        self.init_std = init_std
-        self.nb_priors = nb_priors
-        self.hop_fraction = hop_fraction
 
-        #  Auto regressive (decoder) kwargs :
-        self.prior_attn_order = prior_attn_order
-        self.prior_n_heads = prior_n_heads
-        self.prior_depth = prior_depth
-        self.prior_width = prior_width
-        self.prior_n_ctx = prior_n_ctx
-        self.prior_latent_dim = prior_latent_dim
-        self.prior_attn_dropout = prior_attn_dropout
-        self.prior_resid_dropout = prior_resid_dropout
-        self.prior_emb_dropout = prior_emb_dropout
-        self.prior_zero_out = prior_zero_out
-        self.prior_res_scale = prior_res_scale
-        self.prior_blocks = prior_blocks
-        self.prior_m_attn = prior_m_attn
-        self.prior_spread = prior_spread
-        self.prior_alignment_layer = prior_alignment_layer
-        self.prior_alignment_head = prior_alignment_head
-        self.prior_init_scale = prior_init_scale
+        if vqvae_config is None:
+            vqvae_config = {}
+            logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
 
-        # Audio conditioning : upsampler parameters
-        self.cond_depth = cond_depth
-        self.cond_width = cond_width
-        self.cond_dilation_growth_rate = cond_dilation_growth_rate
-        self.cond_dilation_cycle = cond_dilation_cycle
-        self.cond_zero_out = cond_zero_out
-        self.cond_m_conv = cond_m_conv
-        self.cond_res_scale = cond_res_scale
-        self.cond_downs_t = cond_downs_t
-        self.cond_strides_t = cond_strides_t
+        self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
+        if prior_config_list is not None :
+            self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
+        else:
+            self.prior_configs = []
+            for prior_idx in range(nb_priors):
+                prior_config = kwargs.pop(f"prior_{prior_idx}", None)
+                if prior_config is None:
+                    prior_config = {}
+                    logger.info(f"prior_{prior_idx}'s  config is None. Initializing the JukeboxPriorConfig list with default values.")
+                self.prior_configs.append(JukeboxPriorConfig(**prior_config))
+
+
+        self.hop_fraction = self.vqvae_config.hop_fraction
+
+        self.init_std = init_std
+        self.nb_priors = nb_priors
 
         # Metadata conditioning
         self.max_nb_genres = max_nb_genres
         self.sampling_rate = sampling_rate
-        self.metadata_dims = metadata_dims
         self.timing_dims = timing_dims
         self.min_duration = min_duration
         self.max_duration = max_duration
         self.metadata_conditioning = metadata_conditioning
 
-        # Lyric conditioning
-        self.merged_decoder = merged_decoder  # is this equivalent ?
-        self.single_enc_dec = single_enc_dec
-        self.lyric_conditioning = lyric_conditioning
-        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
+        super().__init__(**kwargs)
 
-        self.lyric_enc_attn_dropout = lyric_enc_attn_dropout
-        self.lyric_enc_attn_order = lyric_enc_attn_order
-        self.lyric_enc_blocks = lyric_enc_blocks
-        self.lyric_enc_depth = lyric_enc_depth
-        self.lyric_enc_emb_dropout = lyric_enc_emb_dropout
-        self.lyric_enc_heads = lyric_enc_heads
-        self.lyric_enc_init_scale = lyric_enc_init_scale
-        self.lyric_enc_loss_fraction = lyric_enc_loss_fraction
-        self.lyric_enc_m_attn = lyric_enc_m_attn
-        self.lyric_enc_m_mlp = lyric_enc_m_mlp
-        self.lyric_enc_resid_dropout = lyric_enc_resid_dropout
-        self.lyric_enc_res_scale = lyric_enc_res_scale
-        self.lyric_enc_spread = lyric_enc_spread
-        self.lyric_enc_width = lyric_enc_width
-        self.lyric_enc_zero_out = lyric_enc_zero_out
-        self.lyric_enc_n_vocab = lyric_enc_n_vocab
+    @classmethod
+    def from_configs(
+        cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs
+    ):
+        r"""
+        Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+        configuration.
+
+        Returns:
+            [`CLIPConfig`]: An instance of a configuration object
+        """
+        prior_config_list = [config.to_dict() for config in prior_configs]
+        return cls(prior_config_list=prior_config_list, vqvae_config_dict=vqvae_config.to_dict(), **kwargs)
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        for i,config in enumerate(output.pop("prior_configs")):
+            output[f"prior_{i}"] = config.to_dict()
+
+        output["vqvae_config"] = self.vqvae_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
 
-        # VQVAE parameters (all used)
-        self.vqvae_levels = vqvae_levels
-        self.vqvae_downs_t = vqvae_downs_t
-        self.vqvae_strides_t = vqvae_strides_t
-        self.vqvae_emmbedding_width = vqvae_emmbedding_width
-        self.vqvae_codebook_dimension = vqvae_codebook_dimension
-        self.vqvae_width = vqvae_width
-        self.vqvae_depth = vqvae_depth
-        self.vqvae_m_conv = vqvae_m_conv
-        self.vqvae_dilation_growth_rate = vqvae_dilation_growth_rate
-        self.vqvae_dilation_cycle = vqvae_dilation_cycle
-        self.vqvae_multipliers = vqvae_multipliers
-        self.vqvae_lmu = vqvae_lmu
-        self.vqvae_commit = vqvae_commit
-        self.vqvae_conv_block_depth = vqvae_conv_block_depth
-        self.vqvae_conv_block_width = vqvae_conv_block_width
-        self.vqvae_reverse_decoder_dilation = vqvae_reverse_decoder_dilation
 
-        super().__init__(**kwargs)
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index bbf8804bedd65..25f0b58ed4c68 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -57,28 +57,42 @@ def replace_key(key):
     elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
         key = key.replace(".model.3.weight", ".conv1d_2.weight")
 
+
+
+    if "conditioner_blocks.0." in key:
+        key = key.replace("conditioner_blocks.0", "conditioner_blocks")
+
+
     if "prime_prior" in key:
-        key = key.replace("prime_prior", "lyric_encoder")
+        key = key.replace("prime_prior", "encoder")
+
+    if ".emb." in key and not "total" in key and not "absolute" in key and not "relative"  in key:
+        key = key.replace(".emb.", ".")
 
     if key.endswith("k"):  # replace vqvae.X.k with vqvae.X.codebook
         return key.replace(".k", ".codebook")
     if "y_emb." in key:
         return key.replace("y_emb.", "metadata_embedding.")
+
+    if "x_emb.emb." in key:
+        key = key.replace("0.x_emb.emb", "embed_tokens")
+
     if "prime_state_ln" in key:
-        return key.replace("prime_state_ln", "lyric_encoder.final_layer_norm")
+        return key.replace("prime_state_ln", "encoder.final_layer_norm")
     if ".ln" in key:
         return key.replace(".ln", ".layer_norm")
     if "_ln" in key:
         return key.replace("_ln", "_layer_norm")
 
     if "prime_state_proj" in key:
-        return key.replace("prime_state_proj", "lyric_encoder.proj_in")
+        return key.replace("prime_state_proj", "encoder.proj_in")
     if "prime_x_out" in key:
-        return key.replace("prime_x_out", "lyric_encoder.lm_head")
+        return key.replace("prime_x_out", "encoder.lm_head")
     if "prior.x_out" in key:
         return key.replace("x_out", "fc_proj_out")
     if "x_emb" in key:
         return key.replace("x_emb", "embed_tokens")
+
     return key
 
 
@@ -159,7 +173,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
             regex_match = re_prior_cond_conv_out.match(original_key)
             groups = regex_match.groups()
             block_index = int(groups[1]) * 2 + int(groups[2]) - 2
-            re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}.{groups[-1]}"
+            re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}"
             key = re_prior_cond_conv_out.sub(re_new_key, original_key)
 
         elif re_prior_cond_resnet.fullmatch(original_key):
@@ -167,7 +181,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
             groups = regex_match.groups()
             block_index = int(groups[1]) * 2 + int(groups[2]) - 2
             conv_index = {"1": 1, "3": 2}[groups[-2]]
-            prefix = f"conditioner_blocks.{groups[0]}.upsampler.upsample_block.{block_index}."
+            prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}."
             resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}"
             re_new_key = prefix + resnet_block
             key = re_prior_cond_resnet.sub(re_new_key, original_key)
@@ -175,7 +189,7 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
         elif re_prior_cond_proj_in.fullmatch(original_key):
             regex_match = re_prior_cond_proj_in.match(original_key)
             groups = regex_match.groups()
-            re_new_key = f"conditioner_blocks.{groups[0]}.upsampler.proj_in.{groups[-1]}"
+            re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}"
             key = re_prior_cond_proj_in.sub(re_new_key, original_key)
 
         # keep original key
@@ -216,7 +230,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     # to convert the 5b lyric token model, use : or "openai/jukebox-5b-lyrics"
     # config = JukeboxConfig(
     #     timing_dims=128
-    #     prior_attn_order=[10, 2, 2],
+    #     prior_attention_pattern=[10, 2, 2],
     #     prior_blocks=128,
     #     prime_n_vocab=80,
     #     nb_relevant_lyric_tokens=[512, 0, 0],
@@ -233,7 +247,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     #     prior_depth=[79, 72, 72],
     #     max_nb_genres=1,
     # )
-    config = JukeboxConfig(sample_length=1058304)
+    config = JukeboxConfig.from_pretrained("ArthurZ/new-5b-lyrics")
     model = JukeboxModel(config)
 
     weight_dict = []
@@ -252,18 +266,18 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
             else:
                 new_dic[k] = old_dic[k]
 
-        key_prefix = "vqvae" if i == 0 else f"priors.{i-1}"
+        key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}"
         new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping)
         weight_dict.append(new_dic)
 
     vqvae_state_dict = weight_dict.pop(0)
     model.vqvae.load_state_dict(vqvae_state_dict)
     for i in range(len(weight_dict)):
-        model.priors[i].load_state_dict(weight_dict[i])
+        model.priors[i].load_state_dict(weight_dict[2-i])
 
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile:
-        json.dump(mapping, txtfile, sep="\n")
+        json.dump(mapping, txtfile)
 
     print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
@@ -276,13 +290,13 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     # Required parameters
     parser.add_argument(
         "--model_name",
-        default="jukebox-1b-lyrics",
+        default="jukebox-5b-lyrics",
         type=str,
         help="Name of the model you'd like to convert.",
     )
     parser.add_argument(
         "--pytorch_dump_folder_path",
-        default="jukebox-1b-lyrics-converted",
+        default="jukebox-5b-lyrics-converted",
         type=str,
         help="Path to the output PyTorch model directory.",
     )
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2a138a28757db..e7bff0c6baa62 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """PyTorch Jukebox model."""
 
+import copy
 import math
 import os
 from typing import List
@@ -28,7 +29,7 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, logging
 from ...utils.logging import tqdm
-from .configuration_jukebox import JukeboxConfig
+from .configuration_jukebox import JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig, ATTENTION_PATTERNS
 
 
 logger = logging.get_logger(__name__)
@@ -94,7 +95,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     """
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
-        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens)), full_tokens])
+        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype = torch.long).to(full_tokens.device), full_tokens])
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
     else:
         midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
@@ -134,18 +135,14 @@ def get_alignment(music_tokens, labels, prior, config):
     attn_layers = set([alignment_layer])
     alignment_hops = {}
     indices_hops = {}
-    prior.to("cuda")
     for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc="Computing lyric to music alignment "):
         end = start + n_ctx
         # set metadata offset, sample_length and lyrics tokens
         metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
-        metadata.to("cuda")
         tokens_bs = torch.chunk(tokens, batch_size, dim=0)
         metadata_bs = torch.chunk(metadata, batch_size, dim=0)
         w_hops = []
         for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
-            tokens_i = tokens_i.to("cuda")
-            metadata_i = metadata_i.to("cuda")
             w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers)
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
@@ -215,76 +212,60 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
 
 
 class JukeboxConv1D(nn.Module):
-    def __init__(self, n_in, n_out, zero_out=False):
-        super(JukeboxConv1D, self).__init__()
-        self.n_in = n_in
-        self.n_out = n_out
-        if zero_out:
-            w = torch.zeros(n_in, n_out)
-        else:
-            w = torch.empty(n_in, n_out)
+    def __init__(self, input_width, output_width):
+        super().__init__()
+        self.input_width = input_width
+        self.output_width = output_width
+        weight = torch.empty(input_width, output_width)
 
-        b = torch.zeros(n_out)
-        self.weight = nn.Parameter(w)
-        self.bias = nn.Parameter(b)
+        bias = torch.zeros(output_width)
+        self.weight = nn.Parameter(weight)
+        self.bias = nn.Parameter(bias)
 
     def forward(self, hidden_states):
-        size_out = (*hidden_states.size()[:-1], self.n_out)
+        size_out = (*hidden_states.size()[:-1], self.output_width)
         hidden_states = torch.addmm(
             self.bias.type_as(hidden_states),
             hidden_states.view(-1, hidden_states.size(-1)),
             self.weight.type_as(hidden_states),
-        )  # If hidden_states if float then float else half
+        )
         hidden_states = hidden_states.view(*size_out)
         return hidden_states
 
 
 class JukeboxResConv1DBlock(nn.Module):
-    def __init__(self, n_in, hidden_dim, dilation=1, zero_out=False, res_scale=1.0):
+    def __init__(self, config, conv_width, depth=1, res_scale=1.0):
         super().__init__()
+        hidden_dim = config.res_convolution_multiplier * conv_width
+        dilation = config.res_dilation_growth_rate ** depth
         padding = dilation
-        self.relu = nn.ReLU()
-        self.conv1d_1 = nn.Conv1d(n_in, hidden_dim, 3, 1, padding, dilation)
-        self.conv1d_2 = nn.Conv1d(hidden_dim, n_in, 1, 1, 0)
+
         self.res_scale = res_scale
+        self.activation = nn.ReLU()
+        self.conv1d_1 = nn.Conv1d(conv_width, hidden_dim, 3, 1, padding, dilation)
+        self.conv1d_2 = nn.Conv1d(hidden_dim, conv_width, 1, 1, 0)
 
     def forward(self, hidden_states):
         residuals = hidden_states
-        hidden_states = self.relu(hidden_states)
+        hidden_states = self.activation(hidden_states)
         hidden_states = self.conv1d_1(hidden_states)
-        hidden_states = self.relu(hidden_states)
+        hidden_states = self.activation(hidden_states)
         hidden_states = self.conv1d_2(hidden_states)
         return residuals + self.res_scale * hidden_states
 
 
 class JukeboxResnet1D(nn.Module):
-    def __init__(
-        self,
-        n_in,
-        n_depth,
-        m_conv=1.0,
-        dilation_growth_rate=1,
-        dilation_cycle=None,
-        zero_out=False,
-        res_scale=False,
-        reverse_dilation=False,
-    ):
+    def __init__(self, config, conv_width, n_depth, reverse_dilation=False):
         super().__init__()
+        self.dilation_cycle = config.res_dilation_cycle
+        res_scale = 1.0 if not config.res_scale else 1.0 / math.sqrt(n_depth)
 
         blocks = []
         for depth in range(n_depth):
-            block_depth = depth if dilation_cycle is None else depth % dilation_cycle
-            blocks.append(
-                JukeboxResConv1DBlock(
-                    n_in,
-                    int(m_conv * n_in),
-                    dilation=dilation_growth_rate**block_depth,
-                    zero_out=zero_out,
-                    res_scale=1.0 if not res_scale else 1.0 / math.sqrt(n_depth),
-                )
-            )
+            block_depth = depth if self.dilation_cycle is None else depth % self.dilation_cycle
+            blocks.append(JukeboxResConv1DBlock(config, conv_width, block_depth, res_scale))
 
-        if reverse_dilation:
+        if not reverse_dilation:
             blocks = blocks[::-1]
         self.resnet_block = nn.ModuleList(blocks)
 
@@ -295,31 +276,18 @@ def forward(self, hidden_states):
 
 
 class JukeboxEncoderConvBlock(nn.Module):
-    def __init__(
-        self,
-        input_emb_width,
-        output_emb_width,
-        down_t,
-        stride_t,
-        width,
-        depth,
-        m_conv,
-        dilation_growth_rate=1,
-        dilation_cycle=None,
-        zero_out=False,
-        res_scale=False,
-    ):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
         super().__init__()
         blocks = []
         filter_t = stride_t * 2
         pad_t = stride_t // 2
         if down_t > 0:
             for i in range(down_t):
-                blocks.append(nn.Conv1d(input_emb_width if i == 0 else width, width, filter_t, stride_t, pad_t))
                 blocks.append(
-                    JukeboxResnet1D(width, depth, m_conv, dilation_growth_rate, dilation_cycle, zero_out, res_scale)
+                    nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t)
                 )
-        self.proj_out = nn.Conv1d(width, output_emb_width, 3, 1, 1)
+                blocks.append(JukeboxResnet1D(config, hidden_dim, depth))
+        self.proj_out = nn.Conv1d(hidden_dim, config.embed_dim, 3, 1, 1)
         self.downsample_block = nn.ModuleList(blocks)
 
     def forward(self, hidden_states):
@@ -329,82 +297,15 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class JukeboxDecoderConvBock(nn.Module):
-    def __init__(
-        self,
-        input_emb_width,
-        output_emb_width,
-        down_t,
-        stride_t,
-        width,
-        depth,
-        m_conv,
-        dilation_growth_rate=1,
-        dilation_cycle=None,
-        zero_out=False,
-        res_scale=False,
-        reverse_decoder_dilation=False,
-    ):
-        super().__init__()
-        blocks = []
-        if down_t > 0:
-            filter_t, pad_t = stride_t * 2, stride_t // 2
-            self.proj_in = nn.Conv1d(output_emb_width, width, 3, 1, 1)
-            for i in range(down_t):
-                blocks.append(
-                    JukeboxResnet1D(
-                        width,
-                        depth,
-                        m_conv,
-                        dilation_growth_rate,
-                        dilation_cycle,
-                        zero_out=zero_out,
-                        res_scale=res_scale,
-                        reverse_dilation=reverse_decoder_dilation,
-                    )
-                )
-                blocks.append(
-                    nn.ConvTranspose1d(
-                        width, input_emb_width if i == (down_t - 1) else width, filter_t, stride_t, pad_t
-                    )
-                )
-
-        self.upsample_block = nn.ModuleList(blocks)
-
-    def forward(self, hidden_states):
-        hidden_states = self.proj_in(hidden_states)
-        for block in self.upsample_block:
-            hidden_states = block(hidden_states)
-        return hidden_states
-
-
 class JukeboxEncoder(nn.Module):
-    def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
+    def __init__(self, config, width, depth, levels, downs_t, strides_t):
         super().__init__()
-        self.input_emb_width = input_emb_width
-        self.output_emb_width = output_emb_width
         self.levels = levels
-        self.downs_t = downs_t
-        self.strides_t = strides_t
-
-        block_kwargs_copy = dict(**block_kwargs)
-        if "reverse_decoder_dilation" in block_kwargs_copy:
-            del block_kwargs_copy["reverse_decoder_dilation"]
-
-        def level_block(level, down_t, stride_t):
-            return JukeboxEncoderConvBlock(
-                input_emb_width if level == 0 else output_emb_width,
-                output_emb_width,
-                down_t,
-                stride_t,
-                **block_kwargs_copy,
-            )
-
         self.level_blocks = nn.ModuleList()
 
         iterator = zip(list(range(self.levels)), downs_t, strides_t)
-        for level, down_t, stride_t in iterator:
-            self.level_blocks.append(level_block(level, down_t, stride_t))
+        for i, down_t, stride_t in iterator:
+            self.level_blocks.append(JukeboxEncoderConvBlock(config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t))
 
     def forward(self, hidden_states):
         all_hidden_states = []
@@ -418,24 +319,37 @@ def forward(self, hidden_states):
         return all_hidden_states
 
 
-class JukeboxDecoder(nn.Module):
-    def __init__(self, input_emb_width, output_emb_width, levels, downs_t, strides_t, **block_kwargs):
+class JukeboxDecoderConvBock(nn.Module):
+    def __init__(self,config,embed_dim,hidden_dim,depth,down_t,stride_t):
+        self.embed_dim = embed_dim
+        self.hidden_dim = hidden_dim
         super().__init__()
-        self.input_emb_width = input_emb_width
-        self.output_emb_width = output_emb_width
-        self.levels = levels
-        self.downs_t = downs_t
-        self.strides_t = strides_t
+        blocks = []
+        if down_t > 0:
+            filter_t = stride_t * 2
+            pad_t = stride_t // 2
+            self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1)
+            for i in range(down_t):
+                blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation=True))
+                blocks.append(nn.ConvTranspose1d(hidden_dim, hidden_dim if i<down_t-1 else embed_dim, filter_t, stride_t, pad_t))
+        self.upsample_block = nn.ModuleList(blocks)
 
-        def level_block(level, down_t, stride_t):
-            return JukeboxDecoderConvBock(output_emb_width, output_emb_width, down_t, stride_t, **block_kwargs)
+    def forward(self, hidden_states):
+        hidden_states = self.proj_in(hidden_states)
+        for block in self.upsample_block:
+            hidden_states = block(hidden_states)
+        return hidden_states
 
+
+class JukeboxDecoder(nn.Module):
+    def __init__(self, config, hidden_dim, depth, levels, downs_t, strides_t):
+        super().__init__()
+        self.levels = levels
         self.level_blocks = nn.ModuleList()
-        iterator = zip(list(range(self.levels)), downs_t, strides_t)
-        for level, down_t, stride_t in iterator:
-            self.level_blocks.append(level_block(level, down_t, stride_t))
+        for level, down_t, stride_t in zip(list(range(self.levels)), downs_t, strides_t):
+            self.level_blocks.append(JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t))
 
-        self.out = nn.Conv1d(output_emb_width, input_emb_width, 3, 1, 1)
+        self.out = nn.Conv1d(config.embed_dim, config.conv_input_shape, 3, 1, 1)
 
     def forward(self, hidden_states, all_levels=True):
         hidden_state = hidden_states[-1]
@@ -453,11 +367,11 @@ def forward(self, hidden_states, all_levels=True):
 
 
 class JukeboxBottleneckBlock(nn.Module):
-    def __init__(self, codebook_dim, codebook_width, mu):
+    def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__()
-        self.codebook_dim = codebook_dim
-        self.codebook_width = codebook_width
-        self.mu = mu
+        self.codebook_dim = config.codebook_dimension
+        self.codebook_width = config.embed_dim
+        self.mu = config.lmu
         self.threshold = 1.0
         self.init = False
         self.codebook_sum = None
@@ -537,9 +451,9 @@ def preprocess(self, hidden_states):
 
     def postprocess(self, latent_states, dequantised_states, x_shape):
         # [NT, C] -> NTC -> NCT
-        batch_size, T = x_shape
-        dequantised_states = dequantised_states.view(batch_size, T, -1).permute(0, 2, 1).contiguous()
-        latent_states = latent_states.view(batch_size, T)
+        batch_size, time = x_shape
+        dequantised_states = dequantised_states.view(batch_size, time, -1).permute(0, 2, 1).contiguous()
+        latent_states = latent_states.view(batch_size, time)
         return latent_states, dequantised_states
 
     def quantise(self, latent_states):
@@ -549,7 +463,7 @@ def quantise(self, latent_states):
             torch.sum(latent_states**2, dim=-1, keepdim=True)
             - 2 * torch.matmul(latent_states, codebook_weights)
             + torch.sum(codebook_weights**2, dim=0, keepdim=True)
-        )  # (batch_size *L, b)
+        )  # (batch_size * latent_states , codebook_weights) # better help from this comment
         min_distance, music_tokens = torch.min(distance, dim=-1)
         fit = torch.mean(min_distance)
         return music_tokens, fit
@@ -615,12 +529,12 @@ def forward(self, hidden_states, update_codebook=True):
 
 
 class JukeboxBottleneck(nn.Module):
-    def __init__(self, codebook_dim, codebook_width, mu, levels):
+    def __init__(self, config, levels):
         super().__init__()
         self.levels = levels
         self.level_blocks = nn.ModuleList()
         for level in range(self.levels):
-            self.level_blocks.append(JukeboxBottleneckBlock(codebook_dim, codebook_width, mu))
+            self.level_blocks.append(JukeboxBottleneckBlock(config))
 
     def encode(self, raw_audio):
         music_tokens = [
@@ -683,82 +597,44 @@ def forward(self, input_audio):
 class JukeboxVQVAE(PreTrainedModel):
     config_class = JukeboxConfig
 
-    def __init__(self, config):
+    def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__(config)
+        downs_t = config.res_downs_t
+        strides_t = config.res_strides_t
         if not config.sample_length:
-            downsamples = [stride**down for stride, down in zip(config.vqvae_strides_t, config.vqvae_down_t)]
+            downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
             top_raw_to_tokens = np.prod(downsamples)
             config.sample_length = (
                 config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens
             ) * top_raw_to_tokens
             config.sample_length = config.sample_length.astype(int)
 
-        input_shape = (config.sample_length, 1)
-        block_kwargs = dict(
-            width=config.vqvae_conv_block_width,
-            depth=config.vqvae_conv_block_depth,
-            m_conv=config.vqvae_m_conv,
-            dilation_growth_rate=config.vqvae_dilation_growth_rate,
-            dilation_cycle=config.vqvae_dilation_cycle,
-            reverse_decoder_dilation=config.vqvae_reverse_decoder_dilation,
-        )
-
-        multipliers = config.vqvae_multipliers
-        codebook_width = config.vqvae_emmbedding_width
-
-        self.downs_t = downs_t = config.vqvae_downs_t
-        self.strides_t = strides_t = config.vqvae_strides_t
-        self.codebook_dim = codebook_dim = config.vqvae_codebook_dimension
-        self.commit = config.vqvae_commit
-
-        self.sample_length = input_shape[0]
-        x_shape = input_shape[:-1]
-        x_channels = input_shape[-1]
-        self.x_shape = x_shape
+        self.codebook_dim = config.codebook_dimension
+        self.commit = config.commit
+        self.sample_length = config.sample_length
 
         self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
         self.hop_lengths = np.cumprod(self.downsamples)
-        self.levels = levels = config.vqvae_levels
-        self.music_tokens_shapes = [(int(x_shape[0] // self.hop_lengths[-level - 1]),) for level in range(levels)]
-
-        if multipliers is None:
-            self.multipliers = [1] * levels
-        else:
-            self.multipliers = multipliers
-
-        def _block_kwargs(level):
-            this_block_kwargs = dict(block_kwargs)
-            this_block_kwargs["width"] *= self.multipliers[level]
-            this_block_kwargs["depth"] *= self.multipliers[level]
-            return this_block_kwargs
-
-        def encoder(level):
-            return JukeboxEncoder(
-                x_channels,
-                codebook_width,
-                level + 1,
-                downs_t[: level + 1],
-                strides_t[: level + 1],
-                **_block_kwargs(level),
-            )
+        self.levels = levels = config.levels
+        self.music_tokens_shapes = [
+            (int(self.sample_length // self.hop_lengths[-level - 1])) for level in range(levels)
+        ]
 
-        def decoder(level):
-            return JukeboxDecoder(
-                x_channels,
-                codebook_width,
-                level + 1,
-                downs_t[: level + 1],
-                strides_t[: level + 1],
-                **_block_kwargs(level),
-            )
+        self.multipliers = config.multipliers if config.multipliers is not None else [1] * levels
 
         self.encoders = nn.ModuleList()
         self.decoders = nn.ModuleList()
         for level in range(levels):
-            self.encoders.append(encoder(level))
-            self.decoders.append(decoder(level))
+            width = config.res_conv_width * self.multipliers[level]
+            depth = config.res_conv_depth * self.multipliers[level]
+            self.encoders.append(
+                JukeboxEncoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+            )
+            self.decoders.append(
+                JukeboxDecoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+            )
 
-        self.bottleneck = JukeboxBottleneck(codebook_dim, codebook_width, config.vqvae_lmu, levels)
+        self.bottleneck = JukeboxBottleneck(config, levels)
 
     def _decode(self, music_tokens, start_level=0, end_level=None):
         # Decode
@@ -887,13 +763,16 @@ def forward(self, raw_audio):
 
 
 class JukeboxMLP(nn.Module):
-    def __init__(self, width, hidden_dim, resid_dropout=0.0, afn="gelu", zero_out=False, init_scale=1.0):
+    def __init__(self, config):
         # a single channel is always used in original code
         super().__init__()
-        self.c_fc = JukeboxConv1D(width, hidden_dim)
-        self.c_proj = JukeboxConv1D(hidden_dim, width, zero_out)
-        self.act = ACT2FN[afn]
-        self.dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
+        embed_dim = config.width
+        hidden_dim = int(config.mlp_multiplier * embed_dim)
+
+        self.c_fc = JukeboxConv1D(embed_dim, hidden_dim)
+        self.c_proj = JukeboxConv1D(hidden_dim, embed_dim)
+        self.act = ACT2FN[config.act_fn]
+        self.dropout = nn.Dropout(config.resid_dropout) if config.resid_dropout > 0.0 else lambda x: x
 
     def forward(self, hidden_states):
         hidden_states = self.c_fc(hidden_states)
@@ -917,75 +796,69 @@ def forward(self, input):
 
 
 class JukeboxAttention(nn.Module):
+
     def __init__(
         self,
-        width,
+        config,
         n_ctx,
-        hidden_dim,
-        num_heads,
-        attn_dropout=0.0,
-        resid_dropout=0.0,
-        scale=True,
-        mask=False,
-        zero_out=False,
-        init_scale=1.0,
-        attn_func=0,
-        blocks=None,
-        spread=None,
-        encoder_dims=None,
-        lyric_enc_len=None,
+        attn_func="dense_attn",
+        encoder_len=None,
     ):
         super().__init__()
-        self.width = width  # should have a better name
+        self.embed_dim = config.width
+        self.n_heads = config.n_heads
+        self.dropout = config.attn_dropout
+        hidden_dim = int(config.attention_multiplier * self.embed_dim)
+
+        self.head_dim = hidden_dim // config.n_heads
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
         self.hidden_dim = hidden_dim
-        self.num_heads = num_heads
-        self.scale = scale
-        self.mask = mask
-        if attn_func == 6:
-            self.c_attn = JukeboxConv1D(width, hidden_dim)
-            self.c_enc_kv = JukeboxConv1D(width, hidden_dim * 2)
+        self.scale = self.head_dim**-0.25 #TODO check  1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.n_heads))
+        self.mask = config.mask
+
+        if attn_func == "cross_attention":
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim ) # issue here, for single enc decoder different
+            self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2)
         else:
-            self.c_attn = JukeboxConv1D(width, hidden_dim * 3)
-        self.c_proj = JukeboxConv1D(hidden_dim, width, zero_out)
-        self.attn_dropout = nn.Dropout(attn_dropout) if attn_dropout > 0.0 else lambda x: x
-        self.resid_dropout = nn.Dropout(resid_dropout) if resid_dropout > 0.0 else lambda x: x
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
+
+        self.c_proj = JukeboxConv1D(hidden_dim, self.embed_dim)
+        self.attn_dropout = nn.Dropout(config.attn_dropout) if config.attn_dropout > 0.0 else lambda x: x
+        self.resid_dropout = nn.Dropout(config.resid_dropout) if config.attn_dropout > 0.0 else lambda x: x
 
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
-        if attn_func == 6:
+        if attn_func == "cross_attention":
             self.qkv = self.decode_qkv
-        elif attn_func == 7:
+        elif attn_func == "prime_attn":
             self.qkv = self.prime_qkv
         else:
             self.qkv = self.factored_qkv
 
-        self.attn, self.attn_mask = {
-            0: (self.dense_attn, "autoregressive"),  # Attend to all positions
-            1: (self.block_attn, "autoregressive"),  # Attend to your block
-            2: (self.transpose_block_attn, "autoregressive"),  # Attend to transpose block
-            3: (self.prev_block_attn, None),  # Attend to previous block
-            4: (self.summary_attn, "summary"),  # Attend to last position of each block
-            5: (self.summary_spread_attn, "summary"),
-            6: (self.dense_attn, None),
-            7: (self.prime_attn, "prime"),
-        }[
-            attn_func
-        ]  # Attend to last key position of each block
-
-        self.blocks = blocks
-        self.spread = spread
-        if blocks is not None:
-            self.block_ctx = n_ctx // blocks
+        ATTENTION_MAP = {
+            "dense_attn": (self.dense_attn, "autoregressive"),
+            "block_attn": (self.block_attn, "autoregressive"),
+            "transpose_block_attn": (self.transpose_block_attn, "autoregressive"),
+            "prev_block_attn": (self.prev_block_attn, None),
+            "summary_attn": (self.summary_attn, "summary"),
+            "summary_spread_attn": (self.summary_spread_attn, "summary"),
+            "cross_attention": (self.dense_attn, None),
+            "prime_attn": (self.prime_attn, "prime"),
+        }
+        self.attn, self.attn_mask = ATTENTION_MAP[attn_func]
+
+        self.blocks = config.blocks
+        self.spread = config.spread
+        if self.blocks is not None:
+            self.block_ctx = self.n_ctx // self.blocks
 
         self.sample_t = 0
         self.cache = {}
-        self.encoder_dims = encoder_dims
-        self.lyric_enc_len = lyric_enc_len
+        self.encoder_len = encoder_len
         self.record_attn = False
 
     def _attn(self, query_states, key_states, value_states, sample):
-        scale = 1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.num_heads))
+        scale = self.scale
         if self.training:
             attention_weight = torch.matmul(query_states * scale, key_states * scale)
         else:
@@ -1011,9 +884,9 @@ def _attn(self, query_states, key_states, value_states, sample):
         attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type)
         if self.record_attn:
             self.attention_prob = attention_prob
-            if self.attn_func == 7:
+            if self.attn_func == "prime_attn":
                 # only keep music queries and lyrics keys/values
-                self.attention_prob = self.attention_prob[:, :, self.lyric_enc_len :, : self.lyric_enc_len]
+                self.attention_prob = self.attention_prob[:, :, self.encoder_len :, : self.encoder_len]
         attention_prob = self.attn_dropout(attention_prob)
         context_states = torch.matmul(attention_prob, value_states)
         return context_states
@@ -1026,8 +899,8 @@ def merge_heads(self, hidden_states):
     def split_heads(self, hidden_states, k=False):
         new_hidden_states_shape = (
             *hidden_states.size()[:-1],
-            self.num_heads,
-            hidden_states.size(-1) // self.num_heads,
+            self.n_heads,
+            hidden_states.size(-1) // self.n_heads,
         )
         hidden_states = hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct split_states
         if k:
@@ -1114,14 +987,14 @@ def prev_block_attn(self, query, key, value, sample):
             value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
 
             if query_length < seq_len:
-                qb = query_length // block_ctx
-                kb = seq_len // block_ctx
+                nb_query_blocks = query_length // block_ctx
+                nb_key_blocks = seq_len // block_ctx
                 seq_len = query_length
-                key = key.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
-                key = key.contiguous().view(batch_size * qb, block_ctx, embed_dim)
+                key = key.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+                key = key.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
 
-                value = value.view(batch_size, kb, block_ctx, embed_dim)[:, -qb:]
-                value = value.contiguous().view(batch_size * qb, block_ctx, embed_dim)
+                value = value.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+                value = value.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
 
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
@@ -1163,15 +1036,15 @@ def summary_spread_attn(self, query, key, value, sample):
             return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
 
     def prime_attn(self, query, key, value, sample):
-        lyric_enc_len = self._lyric_enc_len
-        key = key[:, :lyric_enc_len]
-        value = value[:, :lyric_enc_len]
+        encoder_len = self._encoder_len
+        key = key[:, :encoder_len]
+        value = value[:, :encoder_len]
         return self.dense_attn(query, key, value, sample)
 
-    def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
+    def factored_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        if lyric_encoder_states is not None:
-            raise TypeError("lyric_encoder_states should be None")
+        if last_encoder_hidden_states is not None:
+            raise TypeError("last_encoder_hidden_states should be None")
 
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
@@ -1181,7 +1054,7 @@ def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
             if self._cache_len() > l_cache:
                 self._slice_cache(-l_cache)
             if curr_ctx > 1:
-                if self.attn_func != 0:
+                if self.attn_func != "dense_attn":
                     query = self._pad_to_block_ctx(query, query=True)
                     key = self._pad_to_block_ctx(key)
                     value = self._pad_to_block_ctx(value)
@@ -1191,53 +1064,55 @@ def factored_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
                 value = self.cache["value"]
         return query, key, value, sample
 
-    def prime_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
+    def prime_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
-        if lyric_encoder_states is not None:
-            raise TypeError("lyric_encoder_states should be None")
+        if last_encoder_hidden_states is not None:
+            raise TypeError("last_encoder_hidden_states should be None")
         query, key, value = hidden_states.chunk(3, dim=2)
         if sample:
-            if self._cache_len() < self._lyric_enc_len:
+            if self._cache_len() < self._encoder_len:
                 self._append_cache(key, value)
-            if self._cache_len() > self._lyric_enc_len:
-                self._slice_cache(0, self._lyric_enc_len)
+            if self._cache_len() > self._encoder_len:
+                self._slice_cache(0, self._encoder_len)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         return query, key, value, sample
 
-    def decode_qkv(self, hidden_states, lyric_encoder_states=None, sample=False):
+    def decode_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
         query = hidden_states
         if sample:
             if self.sample_t == 0:
                 self.cache["key"], self.cache["value"] = self.c_enc_kv(
-                    lyric_encoder_states.type_as(hidden_states)
+                    last_encoder_hidden_states.type_as(hidden_states)
                 ).chunk(2, dim=2)
             key, value = self.cache["key"], self.cache["value"]
             self.sample_t += curr_ctx
         else:
-            key, value = self.c_enc_kv(lyric_encoder_states.type_as(hidden_states)).chunk(2, dim=2)
+            key, value = self.c_enc_kv(last_encoder_hidden_states.type_as(hidden_states)).chunk(2, dim=2)
         return query, key, value, sample
 
-    def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
+    def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
         curr_ctx = hidden_states.shape[1]
         hidden_states = self.c_attn(hidden_states)
-        query, key, value, sample = self.qkv(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=sample)
-        a = self.attn(query, key, value, sample)
-        if a.shape[1] != curr_ctx:
+        query, key, value, sample = self.qkv(
+            hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+        )
+        attention_scores = self.attn(query, key, value, sample)
+        if attention_scores.shape[1] != curr_ctx:
             offset = self._offset(curr_ctx)
-            a = a[:, offset : offset + curr_ctx, :].contiguous()
-        a = self.c_proj(a)
-        return self.resid_dropout(a)
+            attention_scores = attention_scores[:, offset : offset + curr_ctx, :].contiguous()
+        attention_scores = self.c_proj(attention_scores)
+        return self.resid_dropout(attention_scores)
 
     @property
-    def _lyric_enc_len(self):
-        lyric_enc_len = self.lyric_enc_len
-        lyric_enc_blocks = (lyric_enc_len // self.blocks) + 1
-        return lyric_enc_blocks * self.blocks
+    def _encoder_len(self):
+        encoder_len = self.encoder_len
+        encoder_blocks = (encoder_len // self.blocks) + 1
+        return encoder_blocks * self.blocks
 
     def _offset(self, curr_ctx):
-        if self.attn_func == 0:
+        if self.attn_func == "dense_attn":
             return 0
         return (self.sample_t - curr_ctx) % self.block_ctx
 
@@ -1260,26 +1135,37 @@ def _suff_cache_len(self):
             key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
             location in the context.
         """
-        if self.attn_func == 0:
+        if self.attn_func == "dense_attn":
             return self.sample_t
-        elif self.attn_func == 1:
+        elif self.attn_func == "block_attn":
             return (self.sample_t - 1) % self.block_ctx + 1
-        elif self.attn_func == 2:
+        elif self.attn_func == "transpose_block_attn":
             return self.sample_t
-        elif self.attn_func == 3:
+        elif self.attn_func == "prev_block_attn":
             if self.sample_t <= self.block_ctx:
                 return self.sample_t
             else:
                 curr_block = (self.sample_t - 1) % self.block_ctx + 1
                 prev_block = self.block_ctx
                 return curr_block + prev_block
-        elif self.attn_func == 6:
+        elif self.attn_func == "cross_attn":
             return self.encoder_dims
-        elif self.attn_func == 7:
+        elif self.attn_func == "prime_attn":
             return min(self.sample_t, self._lyric_enc_len)
         else:
             raise NotImplementedError()
 
+        REQUIRED_CACHE_LEN = {
+            "dense_attn":self.sample_t,
+            "block_attn":(self.sample_t - 1) % self.block_ctx + 1,
+            "transpose_block_attn":self.sample_t,
+            "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx  else (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx,
+            "cross_attn":self.encoder_len,
+            "prime_attn":min(self.sample_t, self._encoder_len)
+        }
+
+        return REQUIRED_CACHE_LEN[self.attn_func]
+
     def _slice_cache(self, start, end=None):
         self.cache["key"] = self.cache["key"][:, start:end]
         self.cache["value"] = self.cache["value"][:, start:end]
@@ -1312,63 +1198,30 @@ def del_cache(self):
 class JukeboxBlock(nn.Module):
     def __init__(
         self,
-        width,
+        config,
         n_ctx,
-        num_heads,
-        attn_dropout=0.0,
-        resid_dropout=0.0,
-        afn="gelu",
-        scale=True,
-        mask=False,
-        zero_out=False,
-        init_scale=1.0,
-        res_scale=1.0,
-        m_attn=0.25,
-        m_mlp=1.0,
-        attn_func=0,
-        blocks=None,
-        spread=None,
-        encoder_dims=None,
-        lyric_enc_len=None,
+        attn_func="dense_attn",
+        encoder_len=None,
     ):
         super().__init__()
+        self.width = config.width
         self.attn = JukeboxAttention(
-            width=width,
+            config=config,
             n_ctx=n_ctx,
-            hidden_dim=int(m_attn * width),
-            num_heads=num_heads,
-            attn_dropout=attn_dropout,
-            resid_dropout=resid_dropout,
-            scale=scale,
-            mask=mask,
-            zero_out=zero_out,
-            init_scale=init_scale,
             attn_func=attn_func,
-            blocks=blocks,
-            spread=spread,
-            encoder_dims=encoder_dims,
-            lyric_enc_len=lyric_enc_len,
+            encoder_len=encoder_len,
         )
 
-        self.layer_norm_0 = JukeboxLayerNorm(width)
-        self.mlp = JukeboxMLP(
-            width=width,
-            hidden_dim=int(m_mlp * width),
-            resid_dropout=resid_dropout,
-            afn=afn,
-            zero_out=zero_out,
-            init_scale=init_scale,
-        )
-        self.layer_norm_1 = JukeboxLayerNorm(width)
-        self.res_scale = res_scale
-
-        self.width = width
+        self.layer_norm_0 = JukeboxLayerNorm(config.width)
+        self.mlp = JukeboxMLP(config)
+        self.layer_norm_1 = JukeboxLayerNorm(config.width)
+        self.res_scale = 1.0 / config.depth if config.res_scale else 1.0
         self.attn_func = attn_func
 
-    def forward(self, hidden_states, lyric_encoder_states, sample=False):
+    def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
         residuals = hidden_states
         hidden_states = self.layer_norm_0(hidden_states)
-        hidden_states = self.attn(hidden_states, lyric_encoder_states, sample)
+        hidden_states = self.attn(hidden_states, last_encoder_hidden_states, sample)
 
         output_states = self.layer_norm_1(residuals + hidden_states)
         output_states = self.mlp(output_states)
@@ -1379,95 +1232,35 @@ def forward(self, hidden_states, lyric_encoder_states, sample=False):
         return output
 
 
-class JukeboxTransformer(nn.Module):
+class JukeboxLayerStack(nn.Module):
     def __init__(
         self,
-        width,
+        config,
         n_ctx,
-        num_heads,
-        n_depth,
-        attn_dropout=0.0,
-        resid_dropout=0.0,
-        afn="gelu",
-        scale=True,
-        mask=False,
-        zero_out=False,
-        init_scale=1.0,
-        res_scale=False,
-        m_attn=0.25,
-        m_mlp=1.0,
-        attn_order=0,
-        blocks=None,
-        spread=None,
-        encoder_dims=None,
-        lyric_enc_len=None,
+        encoder_len=None,
     ):
         super().__init__()
-        self.width = width
         self.n_ctx = n_ctx
-        self.encoder_dims = encoder_dims
-        self.blocks = blocks
-        if blocks is not None:
-            self.block_ctx = n_ctx // blocks
-        self.lyric_enc_len = lyric_enc_len
-        self.num_heads = num_heads
+        self.width = config.width
+        self.depth = config.depth
+        self.blocks = config.blocks
+        self.attention_pattern = config.attention_pattern
+        if self.blocks is not None:
+            self.block_ctx = n_ctx // self.blocks
+        self.encoder_len = encoder_len
+        self.n_heads = config.n_heads
 
-        res_scale = 1.0 / n_depth if res_scale else 1.0
 
         # Orders of attn_func
-        attn_func = self.get_attn_func(attn_order)
-
-        def attn_block(depth):
-            return JukeboxBlock(
-                width=width,
-                n_ctx=n_ctx,
-                num_heads=num_heads,
-                attn_dropout=attn_dropout,
-                resid_dropout=resid_dropout,
-                afn=afn,
-                scale=scale,
-                mask=mask,
-                zero_out=zero_out if attn_func(depth) != 6 else True,
-                init_scale=init_scale,
-                res_scale=res_scale,
-                m_attn=m_attn,
-                m_mlp=m_mlp,
-                attn_func=attn_func(depth),
-                blocks=blocks,
-                spread=spread,
-                encoder_dims=encoder_dims,
-                lyric_enc_len=lyric_enc_len,
-            )
-
+        attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
         self._attn_mods = nn.ModuleList()
-        for depth in range(n_depth):
-            self._attn_mods.append(attn_block(depth))
+        for depth in range(self.depth):
+            self._attn_mods.append(
+                JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth),encoder_len=encoder_len)
+            )
 
         self.saved_attn_weights = []
 
-    def get_attn_func(self, attn_order: int):
-        """
-        Get the correct attention order pattern.
-        """
-        mapping = {
-            0: lambda layer: 0,
-            1: lambda layer: [1, 2][layer % 2],
-            2: lambda layer: [1, 2, 3][layer % 3],  # Alternate row, column and previous row attn
-            3: lambda layer: [1, 4][layer % 2],  # Alternate row and last column
-            4: lambda layer: [1, 5][layer % 2],  # Alternate row and last k columns
-            5: lambda layer: [1, 4, 1, 1][layer % 4],  # Alternate row, last column, row, row
-            6: lambda layer: [1, 2, 3, 6][layer % 4],
-            7: lambda layer: [*[1, 2, 3] * 5, 6][layer % 16],
-            8: lambda layer: [1, 2, 3, 1, 2, 3, 1, 2, 3, 6][layer % 10],  # Used by separated_enc_dec model with lyrics
-            9: lambda layer: [1, 2, 3, 0][layer % 4],
-            # Used by large separated_enc_dec model with lyrics
-            10: lambda layer: [*[1, 2, 3, 1, 2, 3, 1, 2, 3], *[1, 2, 3, 1, 2, 3, 1, 2, 3, 6] * 7][layer % 79],
-            11: lambda layer: [6, 6, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3],
-            # Used by single_enc_dec model with lyrics
-            12: lambda layer: [7, 7, 0][layer % 3] if layer % 16 == 15 else [1, 2, 3][layer % 3],
-        }
-
-        return mapping[attn_order]
 
     def set_record_attn(self, record_attn):
         """
@@ -1488,13 +1281,15 @@ def _should_record_attn(layer_idx):
         if not record_attn:
             self.saved_attn_weights = []
 
-    def forward(self, hidden_states, lyric_encoder_states=None, sample=False):
+    def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
         # Blocks
         for i, attn_layer in enumerate(self._attn_mods):
-            if attn_layer.attn_func == 6:  # attend to the lyrics
-                hidden_states = attn_layer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=sample)
+            if attn_layer.attn_func == "cross_attention":  # attend to the lyrics
+                hidden_states = attn_layer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+                )
             else:
-                hidden_states = attn_layer(hidden_states, lyric_encoder_states=None, sample=sample)
+                hidden_states = attn_layer(hidden_states, last_encoder_hidden_states=None, sample=sample)
             if attn_layer.attn.record_attn:
                 self.saved_attn_weights.append(attn_layer.attn.c_attn.weight)
         return hidden_states
@@ -1505,12 +1300,10 @@ def del_cache(self):
 
 
 class JukeboxPositionalEmbedding(nn.Module):
-    def __init__(self, input_shape, width, init_scale=1.0):
+    def __init__(self, embed_dim, width):
         super().__init__()
-        self.input_shape = input_shape
-        self.input_dims = np.prod(input_shape)
-        self.pos_emb = nn.Parameter(torch.empty((self.input_dims, width)))
-        nn.init.normal_(self.pos_emb, std=0.01 * init_scale)
+        self.pos_emb = nn.Parameter(torch.empty((embed_dim, width)))
+        # nn.init.normal_(self.pos_emb, std=0.01 * init_scale)
 
     def forward(self):
         pos_emb = self.pos_emb
@@ -1520,86 +1313,47 @@ def forward(self):
 class JukeboxConditionalAutoregressive(nn.Module):
     def __init__(
         self,
-        input_shape,
-        embed_dim,
-        width=128,
-        depth=2,
-        heads=1,
-        attn_dropout=0.0,
-        resid_dropout=0.0,
-        emb_dropout=0.0,
-        mask=True,
-        zero_out=False,
-        init_scale=1.0,
-        res_scale=False,
-        m_attn=0.25,
-        m_mlp=1,
-        attn_order=0,
-        blocks=None,
-        spread=None,
+        config,
+        n_ctx = None,
+        embed_dim = None,
         audio_conditioning=False,
         metadata_conditioning=False,
-        encoder_dims=0,
-        only_encode=False,
-        merged_decoder=False,
-        lyric_enc_len=None,
-        afn="quick_gelu",
+        encoder_len=0,
+        is_encoder=False,
     ):
         """
-        - input_shape : respective dimension of the different inputs (lyrics/music_tokens)
         - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
         dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
         for the lyric tokens.
-        - encoder_dims : input dimension of the lyric encoder.
+        The width corresponds to the number of tokens or lyrics tokens provided in a single pass.
+        It can be different from the embed dim.
         - audio_conditioning : whether or not the prior supports conditionning on audio.
         - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
           timing. When
         False, the start token is random.
-        - lyric_enc_len : for now len of the lyric hidden states
+        - encoder_len : for now len of the lyric hidden states
         """
         super().__init__()
-        self.input_shape = input_shape
-        self.input_dims = input_dims = np.prod(input_shape)
-        self.encoder_dims = encoder_dims
-        self.embed_dim = embed_dim
-        self.width = width
-        self.depth = depth
-
-        self.embed_tokens = nn.Embedding(embed_dim, width)
-        nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
-        self.embed_tokens_dropout = nn.Dropout(emb_dropout)
+        self.width = config.width
+        self.depth = config.depth
+        self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
+        self.embed_dim = embed_dim if embed_dim is not None else config.embed_dim
+        self.embed_tokens = nn.Embedding(self.embed_dim, config.width)
+        # nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
+        self.embed_tokens_dropout = nn.Dropout(config.emb_dropout)
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
-            self.start_token = nn.Parameter(torch.empty((1, width)))
-            nn.init.normal_(self.start_token, std=0.01 * init_scale)
-        self.pos_emb = JukeboxPositionalEmbedding(input_shape=input_shape, width=width, init_scale=init_scale)
-        self.pos_emb_dropout = nn.Dropout(emb_dropout)
-
-        self.transformer = JukeboxTransformer(
-            width=width,
-            n_ctx=input_dims,
-            num_heads=heads,
-            n_depth=depth,
-            attn_dropout=attn_dropout,
-            resid_dropout=resid_dropout,
-            afn=afn,
-            scale=True,
-            mask=mask,
-            zero_out=zero_out,
-            init_scale=init_scale,
-            res_scale=res_scale,
-            m_attn=m_attn,
-            m_mlp=m_mlp,
-            attn_order=attn_order,
-            blocks=blocks,
-            spread=spread,
-            encoder_dims=encoder_dims,
-            lyric_enc_len=lyric_enc_len,
-        )
-        self.only_encode = only_encode
-        self.lyric_enc_len = lyric_enc_len
-        if merged_decoder:
+            self.start_token = nn.Parameter(torch.empty((1, config.width)))
+            # nn.init.normal_(self.start_token, std=0.01 * init_scale)
+        self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.width)
+        self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
+
+        self.transformer = JukeboxLayerStack(config,n_ctx=self.n_ctx,encoder_len=encoder_len)
+        self.is_encoder = is_encoder
+        self.encoder_len = encoder_len
+
+        if config.merged_decoder:
             # Merged piped model uses this setup
             self.add_cond_after_transformer = False
             self.share_embed_tokens_fc_proj_out = False
@@ -1607,17 +1361,17 @@ def __init__(
             self.add_cond_after_transformer = True
             self.share_embed_tokens_fc_proj_out = True
 
-        if not only_encode:
-            self.fc_proj_out = nn.Linear(width, embed_dim, bias=False)
+        if not is_encoder:
+            self.fc_proj_out = nn.Linear(config.width, self.embed_dim, bias=False)
             if self.share_embed_tokens_fc_proj_out:
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
-
+            # nn.init.normal_(self.encoder.lm_head.weight, std=0.02 * decoder_config["init_scale"])
     def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
         batch_size = tokens.shape[0]
-        if sample_tokens is None or sample_tokens == self.input_dims:
-            return tokens.view(batch_size, *self.input_shape)
+        if sample_tokens is None or sample_tokens == self.embed_dim:
+            return tokens.view(batch_size, *self.embed_dim)
         else:
             return tokens.view(batch_size, -1)
 
@@ -1626,13 +1380,14 @@ def forward(
         tokens,
         audio_conditioning=None,
         metadata_conditioning=None,
-        lyric_encoder_states=None,
+        last_encoder_hidden_states=None,
         get_preds=False,
         get_acts=False,
         get_sep_loss=False,
     ):
         """
         - tokens : composed of both music tokens and lyrics tokens or just music tokens
+        depending on the `merged_decoder` flag.
         """
         # Preprocess.
         batch_size = tokens.shape[0]
@@ -1660,34 +1415,33 @@ def forward(
             self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning
         )  # Pos emb and dropout
 
-        hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states)  # Transformer
+        hidden_states = self.transformer(
+            hidden_states, last_encoder_hidden_states=last_encoder_hidden_states
+        )  # Transformer
         if self.add_cond_after_transformer:  # Piped doesnt add x_cond
             hidden_states = hidden_states + audio_conditioning
 
-        acts = hidden_states
-        if self.only_encode:
+        activations = hidden_states
+        if self.is_encoder:
             return hidden_states
-        hidden_states = self.fc_proj_out(hidden_states)  # Predictions
 
+        hidden_states = self.fc_proj_out(hidden_states)  # Predictions
+        loss_fn = nn.CrossEntropyLoss()
         if get_sep_loss:
-            lyric_hidden_states = hidden_states[:, : self.lyric_enc_len].reshape(-1, self.embed_dim)
-            token_hidden_states = hidden_states[:, self.lyric_enc_len :].reshape(-1, self.embed_dim)
+            lyric_hidden_states = hidden_states[:, : self.encoder_len].reshape(-1, self.embed_dim)
+            token_hidden_states = hidden_states[:, self.encoder_len :].reshape(-1, self.embed_dim)
 
-            lyric_loss = F.cross_entropy(lyric_hidden_states, target[:, : self.lyric_enc_len].reshape(-1)) / np.log(
-                2.0
-            )
-            music_token_loss = F.cross_entropy(
-                token_hidden_states, target[:, self.lyric_enc_len :].reshape(-1)
-            ) / np.log(2.0)
+            lyric_loss = loss_fn(lyric_hidden_states, target[:, : self.encoder_len].reshape(-1)) / np.log(2.0)
+            music_token_loss = loss_fn(token_hidden_states, target[:, self.encoder_len :].reshape(-1)) / np.log(2.0)
 
             loss = (lyric_loss, music_token_loss)  # Note order! Lyric is first
         else:
-            loss = F.cross_entropy(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0)  # Loss
+            loss = loss_fn(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0)  # Loss
 
         if get_preds:
             return loss, hidden_states
         elif get_acts:
-            return loss, acts
+            return loss, activations
         else:
             return loss, None
 
@@ -1702,7 +1456,7 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
                 hidden_states[:, 0] = self.start_token
         else:
             hidden_states = self.embed_tokens(tokens)
-        if audio_conditioning.shape == (n_samples, self.input_dims, self.width):
+        if audio_conditioning.shape == (n_samples, self.n_ctx, self.width):
             cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
             cond = audio_conditioning
@@ -1711,12 +1465,13 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
         )  # Pos emb, dropout is identity at eval time
         return hidden_states, cond
 
+    # Could this be made compatible with generate
     def sample(
         self,
         n_samples,
         audio_conditioning=None,
         metadata_conditioning=None,
-        lyric_encoder_states=None,
+        last_encoder_hidden_states=None,
         temp=1.0,
         top_k=0,
         top_p=0.0,
@@ -1724,15 +1479,16 @@ def sample(
         sample_tokens=None,
     ):
         if sample_tokens is None:
-            sample_tokens = self.input_dims
+            sample_tokens = self.n_ctx
 
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros(
                 (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
-            ).to("cpu" if torch.cuda.is_available() else "cpu")
+            ).to(self.fc_proj_out.device)
 
         with torch.no_grad():
-            sampled_tokens, tokens = [], None
+            sampled_tokens = []
+            tokens = None
             if get_preds:
                 preds = []
 
@@ -1743,7 +1499,9 @@ def sample(
                     sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
                 )
 
-                hidden_states = self.transformer(hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True)
+                hidden_states = self.transformer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+                )
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
@@ -1779,7 +1537,7 @@ def primed_sample(
         music_tokens,
         audio_conditioning=None,
         metadata_conditioning=None,
-        lyric_encoder_states=None,
+        last_encoder_hidden_states=None,
         temp=1.0,
         top_k=0,
         top_p=0.0,
@@ -1788,7 +1546,7 @@ def primed_sample(
         sample_tokens=None,
     ):
         if sample_tokens is None:
-            sample_tokens = self.input_dims
+            sample_tokens = self.embed_dim
         # Preprocess.
         batch_size = music_tokens.shape[0]
         with torch.no_grad():
@@ -1830,7 +1588,7 @@ def primed_sample(
                 del conds_prime
                 if not get_preds:
                     del cond_prime
-                x_prime = self.transformer(x_prime, lyric_encoder_states=lyric_encoder_states, sample=True)
+                x_prime = self.transformer(x_prime, last_encoder_hidden_states=last_encoder_hidden_states, sample=True)
 
                 if get_preds:
                     if self.add_cond_after_transformer:
@@ -1855,7 +1613,7 @@ def primed_sample(
                 )
 
                 hidden_states = self.transformer(
-                    hidden_states, lyric_encoder_states=lyric_encoder_states, sample=True
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
                 )  # Transformer
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
@@ -1892,19 +1650,17 @@ class JukeboxMusicTokenConditioner(nn.Module):
 
     """
 
-    def __init__(
-        self, input_shape, embed_dim, down_t, stride_t, out_width, init_scale, zero_out, res_scale, **block_kwargs
-    ):
+    def __init__(self, config, level):
+
         super().__init__()
-        self.width = out_width
-        self.embed_tokens = nn.Embedding(embed_dim, out_width)
-        nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
+        self.embed_tokens = nn.Embedding(config.embed_dim, config.width)
+
 
         # JukeboxMusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
         self.upsampler = JukeboxDecoderConvBock(
-            self.width, self.width, down_t, stride_t, **block_kwargs, zero_out=zero_out, res_scale=res_scale
+            config, config.width, config.res_conv_width, config.res_conv_depth, config.res_downs_t[level], config.res_strides_t[level]
         )
-        self.layer_norm = JukeboxLayerNorm(self.width)
+        self.layer_norm = JukeboxLayerNorm(config.width)
 
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
@@ -1949,12 +1705,12 @@ class JukeboxRangeEmbedding(nn.Module):
     # [start,end) mapped to [0,1,...,bins-1]
     # [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
     # NOTE: Open ended interval on right, so start <= pos < end, not <= end
-    def __init__(self, n_time, embed_dim, range, out_width, init_scale, clamp=False):
+    def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
         super().__init__()
         self.n_time = n_time
         self.embed_dim = embed_dim
         self.emb = nn.Embedding(embed_dim, out_width)
-        nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
+        # TODO add to init_weights nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
         self.pos_min, self.pos_max = range
         self.clamp = clamp
 
@@ -1990,39 +1746,31 @@ def forward(self, pos_start, pos_end=None):
 
 
 class LabelConditioner(nn.Module):
-    def __init__(
-        self,
-        metadata_dims,
-        timing_dims,
-        sampling_rate,
-        min_duration,
-        max_duration,
-        n_time,
-        out_width,
-        init_scale,
-        max_nb_genres,
-        include_time_signal,
-    ):
+    def __init__(self, config, include_time_signal):
         super().__init__()
-        self.n_time = n_time
-        self.out_width = out_width
-        nb_genres, nb_artists = metadata_dims
-        self.max_nb_genres = max_nb_genres
-        self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)  # TODO check if that does not break anything
-        self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
-        # self.bow_genre_emb = nn.Embedding(nb_genres, out_width) #TODO maybe test that
-        # self.artist_emb = nn.Embedding(nb_artists, out_width)
-        self.include_time_signal = include_time_signal
+
+        embed_dim = config.width
+        timing_dims = config.timing_dims
+        sampling_rate = config.sampling_rate
+        nb_genres, nb_artists = config.metadata_dims
+        music_tokens_shape = config.n_ctx
+
+        self.max_nb_genres = config.max_nb_genres
+        # self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)  # TODO check if that does not break anything
+        # self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
+        self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)  # TODO maybe test that
+        self.artist_emb = nn.Embedding(nb_artists, embed_dim)
+        self.include_time_signal = include_time_signal  # add to config
         if self.include_time_signal:
-            total_length_range = (min_duration * sampling_rate, max_duration * sampling_rate)
-            absolute_pos_range = (0.0, max_duration * sampling_rate)
+            total_length_range = (config.min_duration * sampling_rate, config.max_duration * sampling_rate)
+            absolute_pos_range = (0.0, config.max_duration * sampling_rate)
             relative_pos_range = (0.0, 1.0)
-            self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, out_width, init_scale)
+            self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, embed_dim)
             self.absolute_pos_emb = JukeboxRangeEmbedding(
-                n_time, timing_dims, absolute_pos_range, out_width, init_scale
+                music_tokens_shape, timing_dims, absolute_pos_range, embed_dim
             )
             self.relative_pos_emb = JukeboxRangeEmbedding(
-                n_time, timing_dims, relative_pos_range, out_width, init_scale, clamp=True
+                music_tokens_shape, timing_dims, relative_pos_range, embed_dim, clamp=True
             )
 
     def forward(self, metadata):
@@ -2073,181 +1821,96 @@ class JukeboxPrior(nn.Module):
     guess it is fine but otherwise it looks strange.
     """
 
-    def __init__(self, config, level, encoder=None, decoder=None):
+    def __init__(
+        self,
+        config: JukeboxPriorConfig,
+        level,
+        nb_priors=3,
+        vqvae_encoder=None,
+        vqvae_decoder=None,
+    ):
         super().__init__()
-
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
         # forward loop
-        self.encoder = encoder
-        self.decoder = decoder
-
-        vqvae_music_tokens_shapes = config.vqvae_music_tokens_shapes
-
-        def rescale(music_tokens_shape):
-            return (music_tokens_shape[0] * config.prior_n_ctx[-level - 1] // vqvae_music_tokens_shapes[level][0],)
-
-        music_tokens_shapes = [rescale(music_tokens_shape) for music_tokens_shape in vqvae_music_tokens_shapes]
-        self.lyric_conditioning = config.lyric_conditioning[-level - 1]
-        self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens[-level - 1]
-        self.lyric_enc_loss_fraction = config.lyric_enc_loss_fraction[-level - 1]
+        self.vqvae_encoder = vqvae_encoder
+        self.vqvae_decoder = vqvae_decoder
 
-        self.music_tokens_shapes = music_tokens_shapes
-        self.levels = len(self.music_tokens_shapes)
+        self.levels = nb_priors
         self.level = level
-        self.music_tokens_shape = self.music_tokens_shapes[level]
-        self.latent_dim = config.prior_latent_dim
-
-        prior_kwargs = dict(
-            input_shape=(config.prior_n_ctx[-level - 1],),
-            embed_dim=config.prior_latent_dim,
-            width=config.prior_width[-level - 1],
-            depth=config.prior_depth[-level - 1],
-            heads=config.prior_n_heads[-level - 1],
-            attn_order=config.prior_attn_order[-level - 1],
-            blocks=config.prior_blocks,
-            spread=config.prior_spread,
-            attn_dropout=config.prior_attn_dropout,
-            resid_dropout=config.prior_resid_dropout,
-            emb_dropout=config.prior_emb_dropout,
-            zero_out=config.prior_zero_out,
-            res_scale=config.prior_res_scale,
-            init_scale=config.prior_init_scale[-level - 1],
-            m_attn=config.prior_m_attn,
-        )
-
-        if config.lyric_conditioning and not config.single_enc_dec[-level - 1]:
-            # lyric_enc -> lyric_enc
-            lyric_enc_kwargs = dict(
-                embed_dim=config.lyric_enc_n_vocab,
-                width=config.lyric_enc_width[-level - 1],
-                depth=config.lyric_enc_depth[-level - 1],
-                heads=config.lyric_enc_heads,
-                attn_order=config.lyric_enc_attn_order[-level - 1],
-                blocks=config.lyric_enc_blocks,
-                spread=config.lyric_enc_spread,
-                attn_dropout=config.lyric_enc_attn_dropout,
-                resid_dropout=config.lyric_enc_resid_dropout,
-                emb_dropout=config.lyric_enc_emb_dropout,
-                zero_out=config.lyric_enc_zero_out,
-                res_scale=config.lyric_enc_res_scale,
-                init_scale=config.lyric_enc_init_scale[-level - 1],
-                m_attn=config.lyric_enc_m_attn,
-                m_mlp=config.lyric_enc_m_mlp,
-            )
-        else:
-            lyric_enc_kwargs = dict(embed_dim=config.lyric_enc_n_vocab)
-
-        audio_conditioning_kwargs = dict(
-            out_width=config.prior_width[-level - 1],
-            init_scale=config.prior_init_scale[-level - 1],
-            width=config.cond_width[-level - 1],
-            depth=config.cond_depth[-level - 1],
-            m_conv=config.cond_m_conv,
-            dilation_growth_rate=config.cond_dilation_growth_rate[-level - 1],
-            dilation_cycle=config.cond_dilation_cycle[-level - 1],
-            zero_out=config.cond_zero_out,
-            res_scale=config.cond_res_scale[-level - 1],
-        )  # have to keep this else names wrong
-
-        metadata_conditioning_kwargs = dict(
-            out_width=config.prior_width[-level - 1],
-            init_scale=config.prior_init_scale[-level - 1],
-            metadata_dims=config.metadata_dims[-level - 1],
-            timing_dims=config.timing_dims,
-            sampling_rate=config.sampling_rate,
-            min_duration=config.min_duration,
-            max_duration=config.max_duration,
-            max_nb_genres=config.max_nb_genres,
-        )
+        self.n_ctx = config.n_ctx
 
-        # Audio conditioning
-        self.audio_conditioning = level != (self.levels - 1)
-        self.cond_level = level + 1
+        self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0
+        self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
+        self.encoder_loss_fraction = config.encoder_loss_fraction
 
-        # metadata conditioning
-        self.metadata_conditioning = config.metadata_conditioning
-
-        self.single_enc_dec = config.single_enc_dec[-level - 1]
         # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
+        self.audio_conditioning = level != 0
+        self.cond_level = level - 1
         if self.audio_conditioning:
-            self.conditioner_blocks = nn.ModuleList()
-
-            def conditioner_block(_level):
-                return JukeboxMusicTokenConditioner(
-                    input_shape=music_tokens_shapes[_level],
-                    embed_dim=config.prior_latent_dim,
-                    down_t=config.cond_downs_t[_level],
-                    stride_t=config.cond_strides_t[_level],
-                    **audio_conditioning_kwargs,
-                )
-
-            self.conditioner_blocks.append(conditioner_block(self.cond_level))
+            self.conditioner_blocks = JukeboxMusicTokenConditioner(config, self.level)
 
         # metadata conditioning : contioning on timing, genres, and artist
+        self.metadata_conditioning = config.metadata_conditioning
         if self.metadata_conditioning:
-            self.n_time = self.music_tokens_shape[0]  # Assuming STFT=TF order and raw=T1 order, so T is first dim
-            self.metadata_embedding = LabelConditioner(
-                n_time=self.n_time, include_time_signal=not self.audio_conditioning, **metadata_conditioning_kwargs
-            )
+            # Assuming STFT=TF order and raw=T1 order, so Time is first dim
+            self.metadata_embedding = LabelConditioner(config, include_time_signal=not self.audio_conditioning)
 
-        if config.single_enc_dec[-level - 1]:
-            # Single encoder-decoder transformer
-            self.prior_shapes = [(self.nb_relevant_lyric_tokens,), prior_kwargs.pop("input_shape")]
-            self.prior_embed_dim = [lyric_enc_kwargs["embed_dim"], prior_kwargs.pop("embed_dim")]
-            self.prior_dims = [np.prod(shape) for shape in self.prior_shapes]
-            self.prior_embed_dim_shift = np.cumsum([0, *self.prior_embed_dim])[:-1]
-            self.prior_width = prior_kwargs["width"]
+        # define encoder-decoder or encoder and decoder
+        self.single_enc_dec = config.single_enc_dec
+        if config.single_enc_dec:
+            # encoder-decoder transformer
+            self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx]
+            self.embed_dim_shift = [0, config.encoder_n_vocab]
+            self.width = config.width
 
-            # lyrics_enc_loss_dims was the lyric_enc loss dims, gen is for the generated tokens.
-            # what is the shape of the lyrics loss?
+            self.lyrics_enc_loss_dims = config.nb_relevant_lyric_tokens
 
-            self.lyrics_enc_loss_dims, self.gen_loss_dims = self.prior_dims[0], self.prior_dims[1]
-            self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
             self.prior = JukeboxConditionalAutoregressive(
-                input_shape=(sum(self.prior_dims),),
-                embed_dim=sum(self.prior_embed_dim),
+                config,
+                n_ctx = config.nb_relevant_lyric_tokens + config.n_ctx,
+                embed_dim = config.encoder_n_vocab + config.embed_dim,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
-                lyric_enc_len=self.lyrics_enc_loss_dims,
-                **prior_kwargs,
+                encoder_len=self.lyrics_enc_loss_dims,
             )
 
         else:
             # Separate encoder-decoder transformer
+            #  we have to modify the config to use the encoder variables
+            encoder_config = self._get_encoder_config(config)
+
             if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
-                lyric_enc_input_shape = (self.nb_relevant_lyric_tokens,)
-                self.lyrics_enc_loss_dims = np.prod(lyric_enc_input_shape)
-                self.lyric_acts_width, self.lyric_enc_width = lyric_enc_kwargs["width"], prior_kwargs["width"]
-                self.lyric_encoder = JukeboxConditionalAutoregressive(
-                    input_shape=lyric_enc_input_shape,
+                self.lyrics_enc_loss_dims = self.nb_relevant_lyric_tokens
+                self.lyric_acts_width = encoder_config.width
+                self.encoder_width = config.width
+                self.encoder_dim = encoder_config.encoder_n_vocab
+                self.encoder = JukeboxConditionalAutoregressive(
+                    encoder_config,
+                    n_ctx = self.nb_relevant_lyric_tokens,
+                    embed_dim = self.encoder_dim,
                     audio_conditioning=False,
                     metadata_conditioning=False,
-                    only_encode=True,
-                    **lyric_enc_kwargs,
+                    is_encoder=True,
                 )
-                self.lyric_encoder.proj_in = JukeboxConv1D(self.lyric_acts_width, self.lyric_enc_width)
-                self.lyric_encoder.final_layer_norm = JukeboxLayerNorm(self.lyric_enc_width)
-                self.lyric_enc_dim = lyric_enc_kwargs["embed_dim"]
-                self.lyric_encoder.lm_head = nn.Linear(self.lyric_enc_width, self.lyric_enc_dim, bias=False)
-                nn.init.normal_(self.lyric_encoder.lm_head.weight, std=0.02 * prior_kwargs["init_scale"])
+                self.encoder.proj_in = JukeboxConv1D(encoder_config.width, config.width)
+                self.encoder.final_layer_norm = JukeboxLayerNorm(config.width)
+                self.encoder.lm_head = nn.Linear(config.width, encoder_config.encoder_n_vocab, bias=False)
             else:
                 self.lyrics_enc_loss_dims = 0
-            self.gen_loss_dims = np.prod(self.music_tokens_shape)
-            self.total_loss_dims = self.lyrics_enc_loss_dims + self.gen_loss_dims
 
-            # prior on the tokens
+            # decoder model on the tokens
             self.prior = JukeboxConditionalAutoregressive(
+                config,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
-                metadata_conditioning=self.metadata_conditioning,
-                encoder_dims=self.lyrics_enc_loss_dims,
-                merged_decoder=config.merged_decoder[-level - 1],
-                **prior_kwargs,
+                metadata_conditioning=self.metadata_conditioning
             )
 
-        self.n_ctx = self.gen_loss_dims
-        self.downsamples = [stride**down for stride, down in zip(config.cond_strides_t, config.cond_downs_t)]
-        self.cond_downsample = self.downsamples[level + 1] if level != self.levels - 1 else None
-        self.raw_to_tokens = np.prod(self.downsamples[: level + 1])
+        self.next_token_prediction_loss_dims = config.n_ctx
+        self.total_loss_dims = self.lyrics_enc_loss_dims + self.next_token_prediction_loss_dims
+
+        self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
+        self.cond_downsample = self.downsamples[level-1] if level != 0 else None
+        self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - level])
         self.sample_length = self.n_ctx * self.raw_to_tokens
 
         logger.info(
@@ -2255,6 +1918,27 @@ def conditioner_block(_level):
             f" length:{self.sample_length}"
         )
 
+    def _get_encoder_config(self, config):
+        # Set config to use the lyric encoder parameters
+        encoder_config = copy.deepcopy(config)
+        encoder_config.attn_dropout = config.encoder_attn_dropout
+        encoder_config.attention_pattern = config.encoder_attention_pattern
+        encoder_config.blocks = config.encoder_blocks
+        encoder_config.depth = config.encoder_depth
+        encoder_config.emb_dropout = config.encoder_emb_dropout
+        encoder_config.heads = config.encoder_heads
+        encoder_config.init_scale = config.encoder_init_scale
+        encoder_config.loss_fraction = config.encoder_loss_fraction
+        encoder_config.attention_multiplier = config.encoder_attention_multiplier
+        encoder_config.mlp_multiplier = config.encoder_mlp_multiplier
+        encoder_config.resid_dropout = config.encoder_resid_dropout
+        encoder_config.res_scale = config.encoder_res_scale
+        encoder_config.spread = config.encoder_spread
+        encoder_config.width = config.encoder_width
+        encoder_config.zero_out = config.encoder_zero_out
+        encoder_config.n_vocab = config.encoder_n_vocab
+        return encoder_config
+
     def get_metadata(self, labels, start, total_length, offset, get_indices=False):
         metadata = labels.clone()
         metadata[:, 0] = total_length
@@ -2301,8 +1985,8 @@ def get_music_tokens_conds(self, music_tokens, start, end):
         """
         Extracts current level's conditioning music tokens.
         """
-        if self.level != self.levels - 1:
-            music_tokens_cond = music_tokens[self.level + 1]
+        if self.level != 0:
+            music_tokens_cond = music_tokens[self.level - 1]
             music_tokens = music_tokens_cond[:, start // self.cond_downsample : end // self.cond_downsample]
             missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
             if missing_cond_len > 0:
@@ -2320,49 +2004,44 @@ def prior_preprocess(self, tokens, conds):
         """
         batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
-            tokens[i] = (tokens[i] + int(self.prior_embed_dim_shift[i])).view(batch_size, -1)
+            tokens[i] = (tokens[i] + int(self.embed_dim_shift[i])).view(batch_size, -1)
 
         for i in range(len(conds)):
-            cond, dims = conds[i], self.prior_dims[i]
-            if cond is None:
+            if conds[i] is None:
                 conds[i] = torch.zeros(
-                    (batch_size, dims, self.prior_width),
-                    dtype=self.prior.transformer._attn_mods[0].mlp.c_fc.weight.dtype,
-                    device=tokens[0].device,
+                    (batch_size, self.input_shapes[i], self.width), dtype=tokens[0].dtype, device=tokens[0].device
                 )
 
         return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
 
     def prior_postprocess(self, tokens):
         """
-        Shifts back the input tokens if the model is uses an encoder decoder architecture. As the embedding layer is
+        Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
         shared, prior_embed_dim_shift shifts the music token ids by
          - nb_vocab.
         Returns : only returns the music tokens
         """
         batch_size = tokens.shape[0]
-        # dim (nb_lyric_tokens, vqvae_codebook dim = latent_dim of the model)
-        dims = (self.prior_dims[0], tokens.shape[1] - self.prior_dims[0])
+        # dim (nb_lyric_tokens, codebook dim = latent_dim of the model)
+        dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
         tokens = list(torch.split(tokens, dims, dim=1))
 
         # Some of the input tokens might be shifted to take into account the voccabulary fusion
         for i in range(len(tokens)):
-            shape = self.prior_shapes[i]
-            _, bins_shift = int(self.prior_embed_dim[i]), int(self.prior_embed_dim_shift[i])  # bins, -> _,
-            tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1, *shape[1:])
-            tokens[i] = torch.clamp(
-                tokens[i], min=0
-            )  # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
-
+            shape = dims[i]
+            bins_shift = int(self.embed_dim_shift[i])
+            tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1)
+            tokens[i] = torch.clamp(tokens[i], min=0)
+            # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
         return tokens[-1]
 
     def embed_tokens(self, music_tokens_conds):
         """
         Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
         """
-        music_tokens_conds = music_tokens_conds[: self.cond_level - self.level]
+        music_tokens_conds = music_tokens_conds[: self.cond_level]
         audio_conditioning = None
-        for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, self.conditioner_blocks))):
+        for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))):
             audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
         return audio_conditioning
 
@@ -2433,19 +2112,19 @@ def sample(
             # Currently audio_conditioning only uses immediately above layer
             audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
             if self.single_enc_dec:
-                if no_past_context:
-                    music_tokens, audio_conditioning = self.prior_preprocess(
+                if no_past_context:  # the prime_sample function will be used with music_tokens set to None
+                    lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
                         [lyric_tokens], [None, audio_conditioning]
                     )
                 else:
-                    music_tokens, audio_conditioning = self.prior_preprocess(
+                    lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
                         [lyric_tokens, music_tokens], [None, audio_conditioning]
                     )
                 if sample_tokens is not None:
                     sample_tokens += self.nb_relevant_lyric_tokens
-                tokens = self.prior.primed_sample(
+                music_tokens = self.prior.primed_sample(
                     n_samples,
-                    music_tokens,
+                    lyric_and_music_tokens,
                     audio_conditioning,
                     metadata_conditioning,
                     temp=temp,
@@ -2454,15 +2133,15 @@ def sample(
                     chunk_size=chunk_size,
                     sample_tokens=sample_tokens,
                 )
-                music_tokens = self.prior_postprocess(tokens)
+                music_tokens = self.prior_postprocess(music_tokens)
             else:
-                lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens, sample=True)
+                last_encoder_hidden_states = self.get_encoder_states(lyric_tokens, sample=True)
                 if no_past_context:
                     music_tokens = self.prior.sample(
                         n_samples,
                         audio_conditioning,
                         metadata_conditioning,
-                        lyric_encoder_states,
+                        last_encoder_hidden_states,
                         temp=temp,
                         top_k=top_k,
                         top_p=top_p,
@@ -2474,7 +2153,7 @@ def sample(
                         music_tokens,
                         audio_conditioning,
                         metadata_conditioning,
-                        lyric_encoder_states,
+                        last_encoder_hidden_states,
                         temp=temp,
                         top_k=top_k,
                         top_p=top_p,
@@ -2483,36 +2162,33 @@ def sample(
                     )
         return music_tokens
 
-    def get_lyric_encoder_states(self, lyric_tokens, sample=False):
+    def get_encoder_states(self, lyric_tokens, sample=False):
         """
         Retreive the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
         the lyric encoder.
         """
         if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
             if sample:
-                self.lyric_encoder = self.lyric_encoder.to(lyric_tokens.device)
-            lyric_acts = self.lyric_encoder(lyric_tokens, None, None, None)
-            lyric_acts = self.lyric_encoder.proj_in(lyric_acts)
-            lyric_encoder_states = self.lyric_encoder.final_layer_norm(lyric_acts)
-            if sample:
-                self.lyric_encoder.cpu()
+                self.encoder = self.encoder.to(lyric_tokens.device)
+            lyric_acts = self.encoder(lyric_tokens, None, None, None)
+            lyric_acts = self.encoder.proj_in(lyric_acts)
+            last_encoder_hidden_states = self.encoder.final_layer_norm(lyric_acts)
         else:
-            lyric_encoder_states = None
-        return lyric_encoder_states
+            last_encoder_hidden_states = None
+        return last_encoder_hidden_states
 
-    def get_lyric_enc_loss(self, lyric_encoder_states, target_lyrics):
+    def get_encoder_loss(self, last_encoder_hidden_states, target_lyrics):
         """
-        Computes the loss for the lyric encoder, next token prediction.
+        Computes the loss for the lyric encoder: next lyric token prediction.
         """
         if self.lyric_conditioning:
-            # lyric_encoder_states = lyric_encoder_states.float()
-            lyric_encoder_states = self.lyric_encoder.lm_head(lyric_encoder_states)
-            lyric_enc_loss = nn.functional.cross_entropy(
-                lyric_encoder_states.view(-1, self.lyric_enc_dim), target_lyrics.view(-1)
+            last_encoder_hidden_states = self.encoder.lm_head(last_encoder_hidden_states)
+            encoder_loss = nn.functional.cross_entropy(
+                last_encoder_hidden_states.view(-1, self.encoder_dim), target_lyrics.view(-1)
             ) / np.log(2.0)
         else:
-            lyric_enc_loss = torch.tensor(0.0, device="cuda")
-        return lyric_enc_loss
+            encoder_loss = torch.tensor(0.0, device=last_encoder_hidden_states.device)
+        return encoder_loss
 
     def forward_tokens(
         self, music_tokens, music_tokens_conds=[], metadata=None, get_preds=False, get_attn_weights=False
@@ -2530,30 +2206,30 @@ def forward_tokens(
             self.prior.transformer.set_record_attn(get_attn_weights)
         audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
 
-        if self.single_enc_dec:  # the preprocess returns the full tokens, shifted
+        if self.single_enc_dec:  # the preprocess returns the full tokens (Lyrics and Music tokens), shifted
             tokens, audio_conditioning = self.prior_preprocess(
                 [lyric_tokens, music_tokens], [None, audio_conditioning]
             )
-            (lyric_enc_loss, gen_loss), preds = self.prior(
+            (encoder_loss, next_token_prediction_loss), preds = self.prior(
                 tokens, audio_conditioning, metadata_conditioning, get_sep_loss=True, get_preds=get_preds
             )
         else:
-            lyric_encoder_states = self.get_lyric_encoder_states(lyric_tokens)
-            lyric_enc_loss = self.get_lyric_enc_loss(lyric_encoder_states, lyric_tokens)
-            gen_loss, preds = self.prior(
+            last_encoder_hidden_states = self.get_encoder_states(lyric_tokens)
+            encoder_loss = self.get_encoder_loss(last_encoder_hidden_states, lyric_tokens)
+            next_token_prediction_loss, preds = self.prior(
                 music_tokens,
                 audio_conditioning,
                 metadata_conditioning,
-                lyric_encoder_states,
+                last_encoder_hidden_states,
                 get_preds=get_preds,
             )
-        loss = (self.lyric_enc_loss_fraction * lyric_enc_loss * self.lyrics_enc_loss_dims / self.total_loss_dims) + (
-            gen_loss * self.gen_loss_dims / self.total_loss_dims
-        )
+        loss = self.encoder_loss_fraction * encoder_loss * self.lyrics_enc_loss_dims / self.total_loss_dims
+        loss += next_token_prediction_loss * self.next_token_prediction_loss_dims / self.total_loss_dims
+
         metrics = dict(
-            bpd=gen_loss.clone().detach(),
-            lyric_enc_loss=lyric_enc_loss.clone().detach(),
-            gen_loss=gen_loss.clone().detach(),
+            bpd=next_token_prediction_loss.clone().detach(),
+            encoder_loss=encoder_loss.clone().detach(),
+            next_token_prediction_loss=next_token_prediction_loss.clone().detach(),
         )
         if get_preds:
             metrics["preds"] = preds.clone().detach()
@@ -2587,7 +2263,8 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     """
 
     config_class = JukeboxConfig
-    base_model_prefix = "transformer"
+    base_model_prefix = "jukebox"
+    supports_gradient_checkpointing = False
 
     def _init_weights(self, module):
         std = self.config.init_std
@@ -2598,6 +2275,47 @@ def _init_weights(self, module):
         elif isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=std)
 
+        # TODO handle all the initialisation and zero_out from the rest of the code
+        elif isinstance(module, JukeboxRangeEmbedding):
+            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, JukeboxSimpleEmbedding):
+            factor = self.config.initializer_factor
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+        elif isinstance(module, JukeboxAttention):
+            factor = self.config.initializer_factor
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, JukeboxMLP):
+            factor = self.config.initializer_factor
+            in_proj_std = (
+                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            )
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+        elif isinstance(module, JukeboxModel):
+            nn.init.normal_(
+                module.text_projection.weight,
+                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
@@ -2625,9 +2343,25 @@ class JukeboxModel(JukeboxPreTrainedModel):
 
     def __init__(self, config):
         super().__init__(config)
-        self.vqvae = JukeboxVQVAE(config)
-        config.vqvae_music_tokens_shapes = self.vqvae.music_tokens_shapes
-        self.priors = nn.ModuleList([JukeboxPrior(config, level=i) for i in range(config.nb_priors)])
+        vqvae_config = config.vqvae_config
+        self.vqvae = JukeboxVQVAE(vqvae_config)
+        self.set_shared_params(config)
+        self.priors = nn.ModuleList()
+        for level in range(config.nb_priors):
+            self.priors.append(JukeboxPrior(config.prior_configs[level], level))
+
+    def set_shared_params(self, model_config):
+        """
+        Initialises the parameters that are shared. This has to be done here
+        because the list of PiroConfig is nest, and is thus unreachable in the `from_dict` function
+        """
+        for config in model_config.prior_configs:
+            config.sampling_rate = model_config.sampling_rate
+            config.timing_dims = model_config.timing_dims
+            config.min_duration = model_config.min_duration
+            config.max_duration = model_config.max_duration
+            config.max_nb_genres =  model_config.max_nb_genres
+            config.metadata_conditioning = model_config.metadata_conditioning
 
     def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
         return self.vqvae.decode(music_tokens, start_level, end_level, bs_chunks)
@@ -2664,7 +2398,7 @@ def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, l
     # Sample a single window of length=n_ctx at position=start on level=level
     def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start):
         prior = self.priors[level]
-        n_samples = music_tokens[-1].shape[0]
+        n_samples = music_tokens[0].shape[0]
         n_ctx = prior.n_ctx
         end = start + n_ctx
         # get music_tokens already sampled at current level
@@ -2822,7 +2556,7 @@ def _sample(
              1804,  541, 1804, 1434]])
         ```"""
 
-        top_prior = self.priors[-1]
+        top_prior = self.priors[0]
         if sample_length is not None:
             total_length = sample_length
         else:
@@ -2836,32 +2570,29 @@ def _sample(
         self.total_length = (
             total_length  # total length of the signal, might be bit different from the actual generated length
         )
-        for level in reversed(sample_levels):
+        for level in sample_levels:
             sampling_kwargs = dict(
                 temp=0.99 if level == 0 else sampling_temperature,
                 max_batch_size=lower_batch_size if level != sample_levels else max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             )
-
-            self.priors[level].to(music_tokens[level].device).eval()
             # Set correct total_length, hop_length, labels and sampling_kwargs for level
 
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
-            hop_length = int(self.config.hop_fraction[-level - 1] * self.priors[level].n_ctx)
+            hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx)
 
             music_tokens = self.sample_level(
                 music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length
             )
 
-            self.priors[level].to("cpu")
+            # todo add wrapper to automatically send to cpu if unused
             self.vqvae.to(music_tokens[level].device)
             # Decode sample
             with torch.no_grad():
                 raw_audio = self.vqvae.decode(
                     music_tokens[level:], start_level=level, bs_chunks=music_tokens[level].shape[0]
                 )
-            self.vqvae.to("cpu")  # save RAM
 
             if save_results:
                 logdir = f"jukebox/level_{level}"
@@ -2870,9 +2601,9 @@ def _sample(
                 save_temp_audio(
                     logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate
                 )
-                if compute_alignments and self.priors[-1] is not None and self.priors[-1].nb_relevant_lyric_tokens > 0:
+                if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():
-                        alignments = get_alignment(music_tokens, labels[-1], self.priors[-1], self.config)
+                        alignments = get_alignment(music_tokens, labels[-1], self.priors[0], self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
 
         return music_tokens
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 5fc749c3de6b2..49ab4669f01d5 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -99,14 +99,14 @@ class JukeboxTokenizer(PreTrainedTokenizer):
             Path to the vocabulary file which contain a mapping between genres and ids.
         lyrics_file (`str`):
             Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
-        version (`List[`str`], `optional`, default to ["v3", "v2", "v2"]) :
+        version (`List[str]`, `optional`, default to `["v3", "v2", "v2"]`) :
             List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
             `v2`.
         n_genres (`int`, `optional`, defaults to 1):
             Maximum number of genres to use for composition.
         max_n_lyric_tokens (`int`, `optional`, defaults to 512):
             Maximum number of lyric tokens to keep.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
     """

From f629b093a93b411d3a979ec15734a5af31f74efb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 2 Nov 2022 20:58:54 +0000
Subject: [PATCH 148/196] fixup

---
 .../models/jukebox/configuration_jukebox.py   | 34 ++++---
 .../models/jukebox/convert_jukebox.py         |  7 +-
 .../models/jukebox/modeling_jukebox.py        | 89 +++++++++++--------
 3 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index b56c5b60f7958..16d4ae4bbb02f 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -255,12 +255,12 @@ class JukeboxPriorConfig(PretrainedConfig):
 
     def __init__(
         self,
-        sampling_rate = 44100,
-        timing_dims = 64,
-        min_duration = 0,
-        max_duration = 600,
-        max_nb_genres =  1,
-        metadata_conditioning = True,
+        sampling_rate=44100,
+        timing_dims=64,
+        min_duration=0,
+        max_duration=600,
+        max_nb_genres=1,
+        metadata_conditioning=True,
         zero_out=False,
         res_conv_depth=3,
         res_conv_width=128,
@@ -374,10 +374,9 @@ def __init__(
         self.timing_dims = timing_dims
         self.min_duration = min_duration
         self.max_duration = max_duration
-        self.max_nb_genres =  max_nb_genres
+        self.max_nb_genres = max_nb_genres
         self.metadata_conditioning = metadata_conditioning
 
-
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
 
@@ -497,6 +496,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 
         return cls.from_dict(config_dict, **kwargs)
 
+
 class JukeboxConfig(PretrainedConfig):
     """
     This is the configuration class to store the configuration of a [`JukeboxModel`].
@@ -575,7 +575,7 @@ def __init__(
         min_duration=0,
         max_duration=600.0,
         max_nb_genres=5,
-        metadata_conditioning = True,
+        metadata_conditioning=True,
         init_std=0.2,
         **kwargs,
     ):
@@ -585,7 +585,7 @@ def __init__(
             logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
 
         self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
-        if prior_config_list is not None :
+        if prior_config_list is not None:
             self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
         else:
             self.prior_configs = []
@@ -593,10 +593,12 @@ def __init__(
                 prior_config = kwargs.pop(f"prior_{prior_idx}", None)
                 if prior_config is None:
                     prior_config = {}
-                    logger.info(f"prior_{prior_idx}'s  config is None. Initializing the JukeboxPriorConfig list with default values.")
+                    logger.info(
+                        f"prior_{prior_idx}'s  config is None. Initializing the JukeboxPriorConfig list with default"
+                        " values."
+                    )
                 self.prior_configs.append(JukeboxPriorConfig(**prior_config))
 
-
         self.hop_fraction = self.vqvae_config.hop_fraction
 
         self.init_std = init_std
@@ -613,9 +615,7 @@ def __init__(
         super().__init__(**kwargs)
 
     @classmethod
-    def from_configs(
-        cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs
-    ):
+    def from_configs(cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
         r"""
         Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
         configuration.
@@ -634,11 +634,9 @@ def to_dict(self):
             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
         """
         output = copy.deepcopy(self.__dict__)
-        for i,config in enumerate(output.pop("prior_configs")):
+        for i, config in enumerate(output.pop("prior_configs")):
             output[f"prior_{i}"] = config.to_dict()
 
         output["vqvae_config"] = self.vqvae_config.to_dict()
         output["model_type"] = self.__class__.model_type
         return output
-
-
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 25f0b58ed4c68..75e382de29adc 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -57,16 +57,13 @@ def replace_key(key):
     elif key.endswith(".model.3.weight") and len(key.split(".")) > 10:
         key = key.replace(".model.3.weight", ".conv1d_2.weight")
 
-
-
     if "conditioner_blocks.0." in key:
         key = key.replace("conditioner_blocks.0", "conditioner_blocks")
 
-
     if "prime_prior" in key:
         key = key.replace("prime_prior", "encoder")
 
-    if ".emb." in key and not "total" in key and not "absolute" in key and not "relative"  in key:
+    if ".emb." in key and not "total" in key and not "absolute" in key and not "relative" in key:
         key = key.replace(".emb.", ".")
 
     if key.endswith("k"):  # replace vqvae.X.k with vqvae.X.codebook
@@ -273,7 +270,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     vqvae_state_dict = weight_dict.pop(0)
     model.vqvae.load_state_dict(vqvae_state_dict)
     for i in range(len(weight_dict)):
-        model.priors[i].load_state_dict(weight_dict[2-i])
+        model.priors[i].load_state_dict(weight_dict[2 - i])
 
     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
     with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile:
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e7bff0c6baa62..6acf02c3227a4 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -29,7 +29,7 @@
 from ...modeling_utils import PreTrainedModel
 from ...utils import add_start_docstrings, logging
 from ...utils.logging import tqdm
-from .configuration_jukebox import JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig, ATTENTION_PATTERNS
+from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
 
 
 logger = logging.get_logger(__name__)
@@ -95,7 +95,9 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
     """
     full_tokens = full_tokens[0]
     if len(full_tokens) < max_n_lyric_tokens:
-        tokens = torch.cat([torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype = torch.long).to(full_tokens.device), full_tokens])
+        tokens = torch.cat(
+            [torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype=torch.long).to(full_tokens.device), full_tokens]
+        )
         indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
     else:
         midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
@@ -237,7 +239,7 @@ class JukeboxResConv1DBlock(nn.Module):
     def __init__(self, config, conv_width, depth=1, res_scale=1.0):
         super().__init__()
         hidden_dim = config.res_convolution_multiplier * conv_width
-        dilation = config.res_dilation_growth_rate ** depth
+        dilation = config.res_dilation_growth_rate**depth
         padding = dilation
 
         self.res_scale = res_scale
@@ -283,9 +285,7 @@ def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
         pad_t = stride_t // 2
         if down_t > 0:
             for i in range(down_t):
-                blocks.append(
-                    nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t)
-                )
+                blocks.append(nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t))
                 blocks.append(JukeboxResnet1D(config, hidden_dim, depth))
         self.proj_out = nn.Conv1d(hidden_dim, config.embed_dim, 3, 1, 1)
         self.downsample_block = nn.ModuleList(blocks)
@@ -305,7 +305,11 @@ def __init__(self, config, width, depth, levels, downs_t, strides_t):
 
         iterator = zip(list(range(self.levels)), downs_t, strides_t)
         for i, down_t, stride_t in iterator:
-            self.level_blocks.append(JukeboxEncoderConvBlock(config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t))
+            self.level_blocks.append(
+                JukeboxEncoderConvBlock(
+                    config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t
+                )
+            )
 
     def forward(self, hidden_states):
         all_hidden_states = []
@@ -320,7 +324,7 @@ def forward(self, hidden_states):
 
 
 class JukeboxDecoderConvBock(nn.Module):
-    def __init__(self,config,embed_dim,hidden_dim,depth,down_t,stride_t):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
         self.embed_dim = embed_dim
         self.hidden_dim = hidden_dim
         super().__init__()
@@ -331,7 +335,11 @@ def __init__(self,config,embed_dim,hidden_dim,depth,down_t,stride_t):
             self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1)
             for i in range(down_t):
                 blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation=True))
-                blocks.append(nn.ConvTranspose1d(hidden_dim, hidden_dim if i<down_t-1 else embed_dim, filter_t, stride_t, pad_t))
+                blocks.append(
+                    nn.ConvTranspose1d(
+                        hidden_dim, hidden_dim if i < down_t - 1 else embed_dim, filter_t, stride_t, pad_t
+                    )
+                )
         self.upsample_block = nn.ModuleList(blocks)
 
     def forward(self, hidden_states):
@@ -347,7 +355,9 @@ def __init__(self, config, hidden_dim, depth, levels, downs_t, strides_t):
         self.levels = levels
         self.level_blocks = nn.ModuleList()
         for level, down_t, stride_t in zip(list(range(self.levels)), downs_t, strides_t):
-            self.level_blocks.append(JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t))
+            self.level_blocks.append(
+                JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t)
+            )
 
         self.out = nn.Conv1d(config.embed_dim, config.conv_input_shape, 3, 1, 1)
 
@@ -796,7 +806,6 @@ def forward(self, input):
 
 
 class JukeboxAttention(nn.Module):
-
     def __init__(
         self,
         config,
@@ -813,11 +822,11 @@ def __init__(
         self.head_dim = hidden_dim // config.n_heads
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
         self.hidden_dim = hidden_dim
-        self.scale = self.head_dim**-0.25 #TODO check  1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.n_heads))
+        self.scale = self.head_dim**-0.25  # TODO check  1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.n_heads))
         self.mask = config.mask
 
         if attn_func == "cross_attention":
-            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim ) # issue here, for single enc decoder different
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim)  # issue here, for single enc decoder different
             self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2)
         else:
             self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
@@ -1156,12 +1165,14 @@ def _suff_cache_len(self):
             raise NotImplementedError()
 
         REQUIRED_CACHE_LEN = {
-            "dense_attn":self.sample_t,
-            "block_attn":(self.sample_t - 1) % self.block_ctx + 1,
-            "transpose_block_attn":self.sample_t,
-            "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx  else (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx,
-            "cross_attn":self.encoder_len,
-            "prime_attn":min(self.sample_t, self._encoder_len)
+            "dense_attn": self.sample_t,
+            "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
+            "transpose_block_attn": self.sample_t,
+            "prev_block_attn": self.sample_t
+            if self.sample_t <= self.block_ctx
+            else (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx,
+            "cross_attn": self.encoder_len,
+            "prime_attn": min(self.sample_t, self._encoder_len),
         }
 
         return REQUIRED_CACHE_LEN[self.attn_func]
@@ -1250,18 +1261,16 @@ def __init__(
         self.encoder_len = encoder_len
         self.n_heads = config.n_heads
 
-
         # Orders of attn_func
         attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
         self._attn_mods = nn.ModuleList()
         for depth in range(self.depth):
             self._attn_mods.append(
-                JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth),encoder_len=encoder_len)
+                JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth), encoder_len=encoder_len)
             )
 
         self.saved_attn_weights = []
 
-
     def set_record_attn(self, record_attn):
         """
         Arguments:
@@ -1314,8 +1323,8 @@ class JukeboxConditionalAutoregressive(nn.Module):
     def __init__(
         self,
         config,
-        n_ctx = None,
-        embed_dim = None,
+        n_ctx=None,
+        embed_dim=None,
         audio_conditioning=False,
         metadata_conditioning=False,
         encoder_len=0,
@@ -1324,8 +1333,7 @@ def __init__(
         """
         - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
         dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
-        for the lyric tokens.
-        The width corresponds to the number of tokens or lyrics tokens provided in a single pass.
+        for the lyric tokens. The width corresponds to the number of tokens or lyrics tokens provided in a single pass.
         It can be different from the embed dim.
         - audio_conditioning : whether or not the prior supports conditionning on audio.
         - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
@@ -1349,7 +1357,7 @@ def __init__(
         self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.width)
         self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
 
-        self.transformer = JukeboxLayerStack(config,n_ctx=self.n_ctx,encoder_len=encoder_len)
+        self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx, encoder_len=encoder_len)
         self.is_encoder = is_encoder
         self.encoder_len = encoder_len
 
@@ -1367,6 +1375,7 @@ def __init__(
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
             # nn.init.normal_(self.encoder.lm_head.weight, std=0.02 * decoder_config["init_scale"])
+
     def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
         batch_size = tokens.shape[0]
@@ -1655,10 +1664,14 @@ def __init__(self, config, level):
         super().__init__()
         self.embed_tokens = nn.Embedding(config.embed_dim, config.width)
 
-
         # JukeboxMusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
         self.upsampler = JukeboxDecoderConvBock(
-            config, config.width, config.res_conv_width, config.res_conv_depth, config.res_downs_t[level], config.res_strides_t[level]
+            config,
+            config.width,
+            config.res_conv_width,
+            config.res_conv_depth,
+            config.res_downs_t[level],
+            config.res_strides_t[level],
         )
         self.layer_norm = JukeboxLayerNorm(config.width)
 
@@ -1867,8 +1880,8 @@ def __init__(
 
             self.prior = JukeboxConditionalAutoregressive(
                 config,
-                n_ctx = config.nb_relevant_lyric_tokens + config.n_ctx,
-                embed_dim = config.encoder_n_vocab + config.embed_dim,
+                n_ctx=config.nb_relevant_lyric_tokens + config.n_ctx,
+                embed_dim=config.encoder_n_vocab + config.embed_dim,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
                 encoder_len=self.lyrics_enc_loss_dims,
@@ -1886,8 +1899,8 @@ def __init__(
                 self.encoder_dim = encoder_config.encoder_n_vocab
                 self.encoder = JukeboxConditionalAutoregressive(
                     encoder_config,
-                    n_ctx = self.nb_relevant_lyric_tokens,
-                    embed_dim = self.encoder_dim,
+                    n_ctx=self.nb_relevant_lyric_tokens,
+                    embed_dim=self.encoder_dim,
                     audio_conditioning=False,
                     metadata_conditioning=False,
                     is_encoder=True,
@@ -1902,14 +1915,14 @@ def __init__(
             self.prior = JukeboxConditionalAutoregressive(
                 config,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
-                metadata_conditioning=self.metadata_conditioning
+                metadata_conditioning=self.metadata_conditioning,
             )
 
         self.next_token_prediction_loss_dims = config.n_ctx
         self.total_loss_dims = self.lyrics_enc_loss_dims + self.next_token_prediction_loss_dims
 
         self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
-        self.cond_downsample = self.downsamples[level-1] if level != 0 else None
+        self.cond_downsample = self.downsamples[level - 1] if level != 0 else None
         self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - level])
         self.sample_length = self.n_ctx * self.raw_to_tokens
 
@@ -2352,15 +2365,15 @@ def __init__(self, config):
 
     def set_shared_params(self, model_config):
         """
-        Initialises the parameters that are shared. This has to be done here
-        because the list of PiroConfig is nest, and is thus unreachable in the `from_dict` function
+        Initialises the parameters that are shared. This has to be done here because the list of PiroConfig is nest,
+        and is thus unreachable in the `from_dict` function
         """
         for config in model_config.prior_configs:
             config.sampling_rate = model_config.sampling_rate
             config.timing_dims = model_config.timing_dims
             config.min_duration = model_config.min_duration
             config.max_duration = model_config.max_duration
-            config.max_nb_genres =  model_config.max_nb_genres
+            config.max_nb_genres = model_config.max_nb_genres
             config.metadata_conditioning = model_config.metadata_conditioning
 
     def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):

From 28422e7787ad4833f2fcd514bfa1295cd0d7ad66 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 2 Nov 2022 20:59:07 +0000
Subject: [PATCH 149/196] local test now follow the patterm

---
 tests/models/jukebox/test_modeling_jukebox.py | 67 ++++++++++---------
 1 file changed, 35 insertions(+), 32 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 030dc528a48c9..002d6293612b8 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,7 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-1b-lyrics"
+    model_id = "/home/arthur_huggingface_co/transformers/jukebox-1b-lyrics-converted"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -135,16 +135,16 @@ def test_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
+        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
+        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -156,7 +156,7 @@ def test_slow_sampling(self):
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
 
-        top_prior = model.priors[-1]
+        top_prior = model.priors[0]
         start = 0
         z_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
         y = top_prior.get_metadata(labels[-1].clone(), start, 1058304, 0)
@@ -166,16 +166,16 @@ def test_slow_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[-2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
     def test_primed_sampling(self):
@@ -186,25 +186,28 @@ def test_primed_sampling(self):
         waveform = torch.rand((1, 5120, 1))
         tokens = [i.cuda() for i in self.prepare_inputs()]
 
-        zs = [None, None, model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda()]
+        model.priors[0].cuda()
+        zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda(), None, None]
         zs = model._sample(
-            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[-1].raw_to_tokens
+            zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
         )
-        assert torch.allclose(zs[-1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
+        assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
 
-        upper_2 = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).cuda()), dim=-1).long()
-        zs = [None, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), upper_2]
+        model.priors[1].cuda()
+        upper_2 = torch.cat((zs[0], torch.zeros(1, 1000000 - zs[0].shape[-1]).cuda()), dim=-1).long()
+        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
         zs = model._sample(
             zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
         )
         assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
 
+        model.priors[2].cuda()
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
         zs = [model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(), upper_1, upper_2]
         zs = model._sample(
-            zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[-3].raw_to_tokens
+            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
         )
-        assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
+        assert torch.allclose(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow
     def test_vqvae(self):
@@ -223,7 +226,7 @@ def test_vqvae(self):
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-5b-lyrics"
+    model_id = "/home/arthur_huggingface_co/transformers/jukebox-5b-lyrics-converted"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -305,16 +308,16 @@ def test_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-1][0], torch.tensor(self.EXPECTED_OUTPUT_2))
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
+        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_0))
+        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -323,16 +326,16 @@ def test_slow_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
 
         set_seed(0)
         zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
 
         set_seed(0)
-        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[-3].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
     def test_fp16_slow_sampling(self):
@@ -341,5 +344,5 @@ def test_fp16_slow_sampling(self):
 
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[-1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))

From f739f95acbeae737ffd49a538ef066633d4b85a3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:10:18 +0000
Subject: [PATCH 150/196] fix last test and correct initialisation pattern

---
 .../models/jukebox/modeling_jukebox.py        | 83 +++++--------------
 tests/models/jukebox/test_modeling_jukebox.py |  1 +
 2 files changed, 20 insertions(+), 64 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 6acf02c3227a4..8c499bb830972 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -219,7 +219,6 @@ def __init__(self, input_width, output_width):
         self.input_width = input_width
         self.output_width = output_width
         weight = torch.empty(input_width, output_width)
-
         bias = torch.zeros(output_width)
         self.weight = nn.Parameter(weight)
         self.bias = nn.Parameter(bias)
@@ -267,7 +266,7 @@ def __init__(self, config, conv_width, n_depth, reverse_dilation=False):
             block_depth = depth if self.dilation_cycle is None else depth % self.dilation_cycle
             blocks.append(JukeboxResConv1DBlock(config, conv_width, block_depth, res_scale))
 
-        if not reverse_dilation:
+        if reverse_dilation:
             blocks = blocks[::-1]
         self.resnet_block = nn.ModuleList(blocks)
 
@@ -1312,7 +1311,6 @@ class JukeboxPositionalEmbedding(nn.Module):
     def __init__(self, embed_dim, width):
         super().__init__()
         self.pos_emb = nn.Parameter(torch.empty((embed_dim, width)))
-        # nn.init.normal_(self.pos_emb, std=0.01 * init_scale)
 
     def forward(self):
         pos_emb = self.pos_emb
@@ -1347,13 +1345,11 @@ def __init__(
         self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
         self.embed_dim = embed_dim if embed_dim is not None else config.embed_dim
         self.embed_tokens = nn.Embedding(self.embed_dim, config.width)
-        # nn.init.normal_(self.embed_tokens.weight, std=0.02 * init_scale)
         self.embed_tokens_dropout = nn.Dropout(config.emb_dropout)
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
             self.start_token = nn.Parameter(torch.empty((1, config.width)))
-            # nn.init.normal_(self.start_token, std=0.01 * init_scale)
         self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.width)
         self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
 
@@ -1374,7 +1370,6 @@ def __init__(
             if self.share_embed_tokens_fc_proj_out:
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
-            # nn.init.normal_(self.encoder.lm_head.weight, std=0.02 * decoder_config["init_scale"])
 
     def postprocess(self, tokens, sample_tokens=None):
         # Convert back from NL and long to NHWC
@@ -1698,17 +1693,6 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
-
-class JukeboxSimpleEmbedding(nn.Module):
-    def __init__(self, embed_dim, out_width):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.emb = nn.Embedding(embed_dim, out_width)
-
-    def forward(self, y):
-        return self.emb(y)
-
-
 class JukeboxRangeEmbedding(nn.Module):
     # Interpolating
     # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
@@ -1723,7 +1707,6 @@ def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
         self.n_time = n_time
         self.embed_dim = embed_dim
         self.emb = nn.Embedding(embed_dim, out_width)
-        # TODO add to init_weights nn.init.normal_(self.emb.weight, std=0.01 * init_scale)
         self.pos_min, self.pos_max = range
         self.clamp = clamp
 
@@ -1769,9 +1752,7 @@ def __init__(self, config, include_time_signal):
         music_tokens_shape = config.n_ctx
 
         self.max_nb_genres = config.max_nb_genres
-        # self.bow_genre_emb = JukeboxSimpleEmbedding(nb_genres, out_width)  # TODO check if that does not break anything
-        # self.artist_emb = JukeboxSimpleEmbedding(nb_artists, out_width)
-        self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)  # TODO maybe test that
+        self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)
         self.artist_emb = nn.Embedding(nb_artists, embed_dim)
         self.include_time_signal = include_time_signal  # add to config
         if self.include_time_signal:
@@ -1833,7 +1814,7 @@ class JukeboxPrior(nn.Module):
     the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
     guess it is fine but otherwise it looks strange.
     """
-
+    config_class = JukeboxPriorConfig
     def __init__(
         self,
         config: JukeboxPriorConfig,
@@ -2281,47 +2262,21 @@ class JukeboxPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-
-        # TODO handle all the initialisation and zero_out from the rest of the code
+        init_scale = self.config.init_scale
+
+        if isinstance(module, nn.Embedding): # embed_tokens
+            module.weight.data.normal_(mean=0.0, std= 0.02 * init_scale)
+        elif isinstance(module,nn.Parameter):
+            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module,JukeboxPositionalEmbedding):
+            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxRangeEmbedding):
-            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
-            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
-        elif isinstance(module, JukeboxSimpleEmbedding):
-            factor = self.config.initializer_factor
-            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
-            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
-            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
-        elif isinstance(module, JukeboxAttention):
-            factor = self.config.initializer_factor
-            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            out_proj_std = (module.embed_dim**-0.5) * factor
-            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
-            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
-            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
-            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
-        elif isinstance(module, JukeboxMLP):
-            factor = self.config.initializer_factor
-            in_proj_std = (
-                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
-            )
-            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
-            nn.init.normal_(module.fc1.weight, std=fc_std)
-            nn.init.normal_(module.fc2.weight, std=in_proj_std)
-        elif isinstance(module, JukeboxModel):
-            nn.init.normal_(
-                module.text_projection.weight,
-                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
-            )
-            nn.init.normal_(
-                module.visual_projection.weight,
-                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
-            )
+            module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module,nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
+            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+
+        elif isinstance(module,JukeboxConv1D) and self.config.zero_out:
+            module.weight.data.zero_()
 
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
@@ -2585,7 +2540,7 @@ def _sample(
         )
         for level in sample_levels:
             sampling_kwargs = dict(
-                temp=0.99 if level == 0 else sampling_temperature,
+                temp=0.99 if level == len(self.priors) -1 else sampling_temperature,
                 max_batch_size=lower_batch_size if level != sample_levels else max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -2604,7 +2559,7 @@ def _sample(
             # Decode sample
             with torch.no_grad():
                 raw_audio = self.vqvae.decode(
-                    music_tokens[level:], start_level=level, bs_chunks=music_tokens[level].shape[0]
+                    music_tokens[:level+1], start_level=level, bs_chunks=music_tokens[level].shape[0]
                 )
 
             if save_results:
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 002d6293612b8..aff7368dd0b52 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -165,6 +165,7 @@ def test_slow_sampling(self):
         self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         set_seed(0)
+        model.priors[0].cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))

From 15da9b5d6217f0ea5aac3204b5eac70f6c4980ff Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:28:33 +0000
Subject: [PATCH 151/196] fix slopw tests and tokenizer order

---
 src/transformers/models/jukebox/tokenization_jukebox.py | 2 +-
 tests/models/jukebox/test_modeling_jukebox.py           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 49ab4669f01d5..ec5cbc45c3b7d 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -175,7 +175,7 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
             list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
             list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
 
-        lyric_ids = [[], [], [self.lyrics_encoder.get(character, 0) for character in list_lyrics[-1]]]
+        lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[-1]], [], []]
         return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index aff7368dd0b52..53193b106df47 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -196,7 +196,7 @@ def test_primed_sampling(self):
 
         model.priors[1].cuda()
         upper_2 = torch.cat((zs[0], torch.zeros(1, 1000000 - zs[0].shape[-1]).cuda()), dim=-1).long()
-        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
+        zs = [upper_2, model.vqvae.encode(waveform.cuda(), start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
         zs = model._sample(
             zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
         )
@@ -204,7 +204,7 @@ def test_primed_sampling(self):
 
         model.priors[2].cuda()
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
-        zs = [model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(), upper_1, upper_2]
+        zs = [upper_2, upper_1, model.vqvae.encode(waveform.cuda(), start_level=0, bs_chunks=waveform.shape[0])[0].cuda()]
         zs = model._sample(
             zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
         )

From 289f99f72e5234d431fc0de833843938515c6253 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:32:20 +0000
Subject: [PATCH 152/196] update readmes

---
 README_es.md      | 2 +-
 README_ko.md      | 2 +-
 README_zh-hans.md | 2 +-
 README_zh-hant.md | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README_es.md b/README_es.md
index 1df65e9f0cb05..b30811d8aa53c 100644
--- a/README_es.md
+++ b/README_es.md
@@ -317,7 +317,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_ko.md b/README_ko.md
index 0cdacad6dc393..09bf7aeee0354 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -267,7 +267,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index b3249419e5292..4ccfc1a56be82 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -291,7 +291,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 2cc47e9abe5ee..de71dc85c0181 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -303,7 +303,7 @@ conda install -c huggingface transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
-1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever. 
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.

From 3584b211ac46b9f1c746d18967c8e0a07ff0b34e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:33:28 +0000
Subject: [PATCH 153/196] fixcopies and fixup

---
 README_ja.md                                  | 1 +
 tests/models/jukebox/test_modeling_jukebox.py | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README_ja.md b/README_ja.md
index eed7d204f8368..190c56ad434dc 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -354,6 +354,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Jukebox](https://huggingface.co/docs/transformers/main/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 53193b106df47..732f9d8bd94e4 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -204,7 +204,11 @@ def test_primed_sampling(self):
 
         model.priors[2].cuda()
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
-        zs = [upper_2, upper_1, model.vqvae.encode(waveform.cuda(), start_level=0, bs_chunks=waveform.shape[0])[0].cuda()]
+        zs = [
+            upper_2,
+            upper_1,
+            model.vqvae.encode(waveform.cuda(), start_level=0, bs_chunks=waveform.shape[0])[0].cuda(),
+        ]
         zs = model._sample(
             zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
         )

From 72e128a946b77c44a7a4c8e59d05415370969ce7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:34:31 +0000
Subject: [PATCH 154/196] fixup

---
 .../models/jukebox/convert_jukebox.py         |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 21 ++++++++++---------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 75e382de29adc..200166beacd54 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -63,7 +63,7 @@ def replace_key(key):
     if "prime_prior" in key:
         key = key.replace("prime_prior", "encoder")
 
-    if ".emb." in key and not "total" in key and not "absolute" in key and not "relative" in key:
+    if ".emb." in key and "total" not in key and "absolute" not in key and "relative" not in key:
         key = key.replace(".emb.", ".")
 
     if key.endswith("k"):  # replace vqvae.X.k with vqvae.X.codebook
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 8c499bb830972..c75e0d6063598 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1693,6 +1693,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         hidden_states = self.layer_norm(hidden_states)
         return hidden_states
 
+
 class JukeboxRangeEmbedding(nn.Module):
     # Interpolating
     # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
@@ -1814,7 +1815,9 @@ class JukeboxPrior(nn.Module):
     the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
     guess it is fine but otherwise it looks strange.
     """
+
     config_class = JukeboxPriorConfig
+
     def __init__(
         self,
         config: JukeboxPriorConfig,
@@ -2022,7 +2025,6 @@ def prior_postprocess(self, tokens):
 
         # Some of the input tokens might be shifted to take into account the voccabulary fusion
         for i in range(len(tokens)):
-            shape = dims[i]
             bins_shift = int(self.embed_dim_shift[i])
             tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1)
             tokens[i] = torch.clamp(tokens[i], min=0)
@@ -2261,21 +2263,20 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = False
 
     def _init_weights(self, module):
-        std = self.config.init_std
         init_scale = self.config.init_scale
 
-        if isinstance(module, nn.Embedding): # embed_tokens
-            module.weight.data.normal_(mean=0.0, std= 0.02 * init_scale)
-        elif isinstance(module,nn.Parameter):
+        if isinstance(module, nn.Embedding):  # embed_tokens
+            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, nn.Parameter):
             module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module,JukeboxPositionalEmbedding):
+        elif isinstance(module, JukeboxPositionalEmbedding):
             module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxRangeEmbedding):
             module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module,nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
+        elif isinstance(module, nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
             module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
 
-        elif isinstance(module,JukeboxConv1D) and self.config.zero_out:
+        elif isinstance(module, JukeboxConv1D) and self.config.zero_out:
             module.weight.data.zero_()
 
         if isinstance(module, nn.LayerNorm):
@@ -2540,7 +2541,7 @@ def _sample(
         )
         for level in sample_levels:
             sampling_kwargs = dict(
-                temp=0.99 if level == len(self.priors) -1 else sampling_temperature,
+                temp=0.99 if level == len(self.priors) - 1 else sampling_temperature,
                 max_batch_size=lower_batch_size if level != sample_levels else max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
@@ -2559,7 +2560,7 @@ def _sample(
             # Decode sample
             with torch.no_grad():
                 raw_audio = self.vqvae.decode(
-                    music_tokens[:level+1], start_level=level, bs_chunks=music_tokens[level].shape[0]
+                    music_tokens[: level + 1], start_level=level, bs_chunks=music_tokens[level].shape[0]
                 )
 
             if save_results:

From 944303851e84d8c69654ecf6c833851e5f963f6f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 3 Nov 2022 10:55:46 +0000
Subject: [PATCH 155/196] update tips in the readme

---
 docs/source/en/model_doc/jukebox.mdx | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 556588eda19ec..a7b7e51ce9a76 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -29,8 +29,10 @@ The metadata such as *artist, genre and timing* are passed to each prior, in the
 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
 
 Tips:
-- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU.
-- Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`. 
+- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory.
+- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, either use accelerate or refer the the example notbook which should provide a wrapper.
+- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`.
+- Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
 The original code can be found [here](https://github.com/openai/jukebox).
@@ -41,21 +43,21 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 ## JukeboxTokenizer
 
-[[autodoc]] JukeboxTokenizer 
+[[autodoc]] JukeboxTokenizer
     - save_vocabulary
 
 ## JukeboxModel
 
-[[autodoc]] JukeboxModel 
-    - ancestral_sample 
+[[autodoc]] JukeboxModel
+    - ancestral_sample
     - primed_sample
     - continue_sample
     - upsample
     - _sample
 
-## JukeboxVQVAE 
+## JukeboxVQVAE
 
-[[autodoc]] JukeboxVQVAE 
-    - forward 
-    - encode 
+[[autodoc]] JukeboxVQVAE
+    - forward
+    - encode
     - decode

From c55a843342b28b2c7fd114e202583701125c9006 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 4 Nov 2022 17:09:33 +0000
Subject: [PATCH 156/196] update code

---
 .../models/jukebox/modeling_jukebox.py        | 50 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  6 +--
 tests/models/jukebox/test_modeling_jukebox.py | 23 ++++++---
 .../jukebox/test_tokenization_jukebox.py      | 12 ++---
 4 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index c75e0d6063598..9c289db01be48 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -148,16 +148,15 @@ def get_alignment(music_tokens, labels, prior, config):
             w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers)
             w_hops.append(w_hop[0][:, alignment_head])
             del w_hop
-        w = torch.cat(w_hops, dim=0)
+        weights = torch.cat(w_hops, dim=0)
         del w_hops
-        alignment_hop = w.float().cpu().numpy()
+        alignment_hop = weights.float().cpu().numpy()
         del w
 
         # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
         # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
         indices_hops[start] = indices_hop
         alignment_hops[start] = alignment_hop
-    prior.cpu()
 
     # Combine attn for each hop into attn for full range
     # Use indices to place them into correct place for corresponding source tokens
@@ -1452,7 +1451,7 @@ def forward(
     def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
         if sample_t == 0:
             hidden_states = torch.empty(n_samples, 1, self.width, dtype=self.embed_tokens.weight.dtype).to(
-                audio_conditioning.device
+                self.embed_tokens.weight.device
             )
             if self.metadata_conditioning:
                 hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width)
@@ -1906,7 +1905,7 @@ def __init__(
         self.total_loss_dims = self.lyrics_enc_loss_dims + self.next_token_prediction_loss_dims
 
         self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
-        self.cond_downsample = self.downsamples[level - 1] if level != 0 else None
+        self.cond_downsample = self.downsamples[level] if level != 0 else None
         self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - level])
         self.sample_length = self.n_ctx * self.raw_to_tokens
 
@@ -2035,7 +2034,7 @@ def embed_tokens(self, music_tokens_conds):
         """
         Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
         """
-        music_tokens_conds = music_tokens_conds[: self.cond_level]
+        music_tokens_conds = music_tokens_conds[: self.cond_level+1]
         audio_conditioning = None
         for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))):
             audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
@@ -2267,18 +2266,24 @@ def _init_weights(self, module):
 
         if isinstance(module, nn.Embedding):  # embed_tokens
             module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-        elif isinstance(module, nn.Parameter):
-            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxConv1D):
+            if self.config.zero_out:
+                module.weight.data.zero_()
+            else :
+                module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
         elif isinstance(module, JukeboxPositionalEmbedding):
             module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxRangeEmbedding):
             module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
             module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-
-        elif isinstance(module, JukeboxConv1D) and self.config.zero_out:
-            module.weight.data.zero_()
-
+        elif isinstance(module, nn.Parameter) and "pos_emb" in module.__class__.__name__:
+            module.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, nn.Parameter) and "start_token" in module.__class__.__name__:
+            module.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+            module.conv1d_2.weigth.data.zero_()
+            module.conv1d_2.bias.data.zero_()
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
@@ -2381,10 +2386,9 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
 
         else:
             sample_tokens = end - start
-        conditioning_tokens, new_tokens = (
-            previous_sampled_tokens.shape[1],
-            sample_tokens - previous_sampled_tokens.shape[1],
-        )
+
+        conditioning_tokens = previous_sampled_tokens.shape[1]
+        new_tokens = sample_tokens - previous_sampled_tokens.shape[1]
 
         logger.info(
             f"Sampling {sample_tokens} tokens for [{start},{start+sample_tokens}]. Conditioning on"
@@ -2555,15 +2559,13 @@ def _sample(
                 music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length
             )
 
-            # todo add wrapper to automatically send to cpu if unused
-            self.vqvae.to(music_tokens[level].device)
-            # Decode sample
-            with torch.no_grad():
-                raw_audio = self.vqvae.decode(
-                    music_tokens[: level + 1], start_level=level, bs_chunks=music_tokens[level].shape[0]
-                )
-
             if save_results:
+                self.vqvae.to(music_tokens[level].device)
+                # Decode sample
+                with torch.no_grad():
+                    raw_audio = self.vqvae.decode(
+                        music_tokens[: level + 1], start_level=level, bs_chunks=music_tokens[level].shape[0]
+                    )
                 logdir = f"jukebox/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index ec5cbc45c3b7d..0091c89d80947 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -175,7 +175,7 @@ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
             list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
             list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
 
-        lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[-1]], [], []]
+        lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []]
         return artists_id, list_genres, lyric_ids
 
     def _tokenize(self, lyrics):
@@ -229,7 +229,7 @@ def prepare_for_tokenization(
                     self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
                 ]  # split is for the full dictionnary with combined genres
 
-        if self.version[-1] == "v2":
+        if self.version[0] == "v2":
             self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
             vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
@@ -243,7 +243,7 @@ def prepare_for_tokenization(
 
         lyrics = self._run_strip_accents(lyrics)
         lyrics = lyrics.replace("\\", "\n")
-        lyrics = [], [], self.out_of_vocab.sub("", lyrics)
+        lyrics = self.out_of_vocab.sub("", lyrics), [], []
         return artists, genres, lyrics
 
     def _run_strip_accents(self, text):
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 732f9d8bd94e4..37e28e21f42af 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -159,22 +159,26 @@ def test_slow_sampling(self):
         top_prior = model.priors[0]
         start = 0
         z_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
-        y = top_prior.get_metadata(labels[-1].clone(), start, 1058304, 0)
+        y = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
 
         self.assertIsNone(z_conds)
         self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         set_seed(0)
-        model.priors[0].cuda()
+        top_prior.cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        top_prior.cpu()
 
         set_seed(0)
+        model.priors[1].cuda()
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        model.priors[1].cpu()
 
         set_seed(0)
+        model.priors[2].cuda()
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
@@ -193,21 +197,23 @@ def test_primed_sampling(self):
             zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
         )
         assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
+        model.priors[0].cpu()
 
         model.priors[1].cuda()
         upper_2 = torch.cat((zs[0], torch.zeros(1, 1000000 - zs[0].shape[-1]).cuda()), dim=-1).long()
-        zs = [upper_2, model.vqvae.encode(waveform.cuda(), start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
+        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
         zs = model._sample(
             zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
         )
         assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
+        model.priors[1].cpu()
 
         model.priors[2].cuda()
         upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
         zs = [
             upper_2,
             upper_1,
-            model.vqvae.encode(waveform.cuda(), start_level=0, bs_chunks=waveform.shape[0])[0].cuda(),
+            model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(),
         ]
         zs = model._sample(
             zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
@@ -330,15 +336,20 @@ def test_slow_sampling(self):
         labels = [i.cuda() for i in self.prepare_inputs(self.model_id)]
 
         set_seed(0)
+        model.priors[0].cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        model.priors[0].cpu()
 
         set_seed(0)
-        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[-2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        model.priors[1].cuda()
+        zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
+        assert torch.allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        model.priors[1].cpu()
 
         set_seed(0)
+        model.priors[2].cuda()
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
         assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
diff --git a/tests/models/jukebox/test_tokenization_jukebox.py b/tests/models/jukebox/test_tokenization_jukebox.py
index beac3ec246c81..7ce2585bdd64b 100644
--- a/tests/models/jukebox/test_tokenization_jukebox.py
+++ b/tests/models/jukebox/test_tokenization_jukebox.py
@@ -53,8 +53,6 @@ def test_1b_lyrics_tokenizer(self):
         tokens = tokenizer(**self.metas)["input_ids"]
         # fmt: off
         EXPECTED_OUTPUT = [
-            torch.tensor([[0, 0, 0, 1069, 11]]),
-            torch.tensor([[0, 0, 0, 1069, 11]]),
             torch.tensor([[
                 0, 0, 0, 7169, 507, 9, 76, 39, 31, 46, 76, 27,
                 76, 46, 44, 27, 48, 31, 38, 38, 31, 44, 76, 32,
@@ -118,7 +116,9 @@ def test_1b_lyrics_tokenizer(self):
                 76, 38, 31, 48, 31, 38, 76, 45, 27, 40, 30, 45,
                 76, 45, 46, 44, 31, 46, 29, 34, 76, 32, 27, 44,
                 76, 27, 49, 27, 51, 78, 76, 76, 76, 76, 76, 76,
-                76, 76]])
+                76, 76]]),
+            torch.tensor([[0, 0, 0, 1069, 11]]),
+            torch.tensor([[0, 0, 0, 1069, 11]]),
         ]
         # fmt: on
         self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))
@@ -136,8 +136,6 @@ def test_5b_lyrics_tokenizer(self):
         tokens = tokenizer(**self.metas)["input_ids"]
         # fmt: off
         EXPECTED_OUTPUT = [
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
-            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
             torch.tensor([[
                 0, 0, 0, 1069, 11, -1, -1, -1, -1, 9, 77, 39,
                 31, 46, 77, 27, 77, 46, 44, 27, 48, 31, 38, 38,
@@ -201,7 +199,9 @@ def test_5b_lyrics_tokenizer(self):
                 77, 27, 40, 30, 77, 38, 31, 48, 31, 38, 77, 45,
                 27, 40, 30, 45, 77, 45, 46, 44, 31, 46, 29, 34,
                 77, 32, 27, 44, 77, 27, 49, 27, 51, 79, 77, 77,
-                77, 77, 77, 77, 77, 77]])
+                77, 77, 77, 77, 77, 77]]),
+            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
+            torch.tensor([[0, 0, 0, 1069, 11, -1, -1, -1, -1]]),
         ]
         # fmt: on
         self.assertTrue(torch.allclose(tokens[0], EXPECTED_OUTPUT[0]))

From d4917c98522187f9b114c34cb7a18944b437c470 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 15:55:26 +0000
Subject: [PATCH 157/196] renaming here and there + start fixing last tests

---
 .../models/jukebox/configuration_jukebox.py   | 25 +++---
 .../models/jukebox/convert_jukebox.py         |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 87 ++++++++-----------
 tests/models/jukebox/test_modeling_jukebox.py | 65 +++++++++-----
 4 files changed, 92 insertions(+), 87 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 16d4ae4bbb02f..1e97645465557 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -122,9 +122,9 @@
     "large_separated_enc_dec_w_lyrics": lambda layer: _LARGE_ATTENTION[
         layer % 79
     ],  # Used by large separated_enc_dec model with lyrics
-    "single_enc_dec_w_lyrics": lambda layer: _PrimePrimeDenseAttention[layer % 3]
+    "enc_dec_with_lyrics": lambda layer: _PrimePrimeDenseAttention[layer % 3]
     if layer % 16 == 15
-    else _RawColumnPreviousRowAttention[layer % 3],  # Used by single_enc_dec model with lyrics
+    else _RawColumnPreviousRowAttention[layer % 3],  # Used by encoder_decoder model with lyrics
 }
 
 
@@ -144,7 +144,7 @@ class JukeboxPriorConfig(PretrainedConfig):
         metadata_dims (`List[Tuple[int, int]]`, *optional*, defaults to `[(604, 7898), (120, 4111), (120, 4111)]`):
             List containing the number of genres and the number of artists that were used to train the embedding layers
             of each of the prior models.
-        single_enc_dec (`List[bool]`, *optional*, defaults to `[True, False, False]`):
+        is_encoder_decoder (`List[bool]`, *optional*, defaults to `[True, False, False]`):
             Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
             `encoderoder` for each of the priors.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
@@ -266,7 +266,7 @@ def __init__(
         res_conv_width=128,
         res_dilation_growth_rate=1,
         res_dilation_cycle=None,
-        res_scale=None,
+        conv_res_scale=None,
         res_convolution_multiplier=1,
         res_downs_t=(3, 2, 2),
         res_strides_t=(2, 2, 2),
@@ -287,15 +287,16 @@ def __init__(
         encoder_res_scale=False,
         encoder_n_vocab=79,
         init_scale=0.2,
+        attn_res_scale=False,
         n_ctx=6144,
         width=2048,
         depth=72,
         n_heads=2,
-        attention_pattern="single_enc_dec_w_lyrics",
+        attention_pattern="enc_dec_with_lyrics",
         alignment_layer=68,
         alignment_head=2,
         metadata_dims=(604, 7898),
-        single_enc_dec=True,
+        is_encoder_decoder=True,
         merged_decoder=True,
         lyric_conditioning=True,
         nb_relevant_lyric_tokens=384,
@@ -325,7 +326,7 @@ def __init__(
         self.resid_dropout = resid_dropout
         self.emb_dropout = emb_dropout
         self.zero_out = zero_out
-        self.res_scale = res_scale
+        self.conv_res_scale = conv_res_scale
         self.blocks = blocks
         self.attention_multiplier = attention_multiplier
         self.mlp_multiplier = mlp_multiplier
@@ -341,13 +342,13 @@ def __init__(
         self.res_dilation_cycle = res_dilation_cycle
         self.zero_out = zero_out
         self.res_convolution_multiplier = res_convolution_multiplier
-        self.res_scale = res_scale
+        self.attn_res_scale = attn_res_scale
         self.res_downs_t = res_downs_t
         self.res_strides_t = res_strides_t
 
         # Lyric conditioning
         self.merged_decoder = merged_decoder  # is this equivalent ?
-        self.single_enc_dec = single_enc_dec
+        self.is_encoder_decoder = is_encoder_decoder
         self.lyric_conditioning = lyric_conditioning
         self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
 
@@ -454,7 +455,7 @@ def __init__(
         res_convolution_multiplier=1,
         res_dilation_growth_rate=3,
         res_dilation_cycle=None,
-        res_scale=False,
+        conv_res_scale=False,
         act_fn="relu",
         **kwargs
     ):
@@ -476,7 +477,7 @@ def __init__(
         self.res_strides_t = res_strides_t
         self.lmu = lmu
         self.commit = commit
-        self.res_scale = res_scale
+        self.conv_res_scale = conv_res_scale
         self.act_fn = act_fn
 
     @classmethod
@@ -525,7 +526,7 @@ class JukeboxConfig(PretrainedConfig):
         metadata_conditioning (`bool`, *optional*, defaults to `True`):
             Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
             duration.
-        single_enc_dec (`List[bool]`, *optional*, defaults to `[True, False, False]`):
+        is_encoder_decoder (`List[bool]`, *optional*, defaults to `[True, False, False]`):
             Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
             `encoderoder` for each of the priors.
         merged_decoder (`list`, *optional*, defaults to [True, False, False]):
diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 200166beacd54..1921d628e71dd 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -235,7 +235,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
     #     prior_n_ctx=[8192, 8192, 8192],
     #     prime_width=[1280, 128, 128],
     #     prior_width=[4800, 1920, 1920],
-    #     single_enc_dec=[False, False, False],
+    #     is_encoder_decoder=[False, False, False],
     #     timing_dims=128,
     #     vqvae_width=64,
     #     metadata_dims=[(120, 4111), (120, 4111), (120, 4111)],
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9c289db01be48..2b3a751a74e3c 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -258,7 +258,7 @@ class JukeboxResnet1D(nn.Module):
     def __init__(self, config, conv_width, n_depth, reverse_dilation=False):
         super().__init__()
         self.dilation_cycle = config.res_dilation_cycle
-        res_scale = 1.0 if not config.res_scale else 1.0 / math.sqrt(n_depth)
+        res_scale = 1.0 if not config.conv_res_scale else 1.0 / math.sqrt(n_depth)
 
         blocks = []
         for depth in range(n_depth):
@@ -322,7 +322,7 @@ def forward(self, hidden_states):
 
 
 class JukeboxDecoderConvBock(nn.Module):
-    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation = True):
         self.embed_dim = embed_dim
         self.hidden_dim = hidden_dim
         super().__init__()
@@ -332,7 +332,7 @@ def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
             pad_t = stride_t // 2
             self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1)
             for i in range(down_t):
-                blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation=True))
+                blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation))
                 blocks.append(
                     nn.ConvTranspose1d(
                         hidden_dim, hidden_dim if i < down_t - 1 else embed_dim, filter_t, stride_t, pad_t
@@ -820,7 +820,7 @@ def __init__(
         self.head_dim = hidden_dim // config.n_heads
         self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
         self.hidden_dim = hidden_dim
-        self.scale = self.head_dim**-0.25  # TODO check  1.0 / math.sqrt(math.sqrt(self.hidden_dim // self.n_heads))
+        self.scale = self.head_dim**-0.25
         self.mask = config.mask
 
         if attn_func == "cross_attention":
@@ -830,8 +830,8 @@ def __init__(
             self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
 
         self.c_proj = JukeboxConv1D(hidden_dim, self.embed_dim)
-        self.attn_dropout = nn.Dropout(config.attn_dropout) if config.attn_dropout > 0.0 else lambda x: x
-        self.resid_dropout = nn.Dropout(config.resid_dropout) if config.attn_dropout > 0.0 else lambda x: x
+        self.attn_dropout = nn.Dropout(config.attn_dropout)
+        self.resid_dropout = nn.Dropout(config.resid_dropout)
 
         # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
         self.attn_func = attn_func
@@ -1142,26 +1142,6 @@ def _suff_cache_len(self):
             key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
             location in the context.
         """
-        if self.attn_func == "dense_attn":
-            return self.sample_t
-        elif self.attn_func == "block_attn":
-            return (self.sample_t - 1) % self.block_ctx + 1
-        elif self.attn_func == "transpose_block_attn":
-            return self.sample_t
-        elif self.attn_func == "prev_block_attn":
-            if self.sample_t <= self.block_ctx:
-                return self.sample_t
-            else:
-                curr_block = (self.sample_t - 1) % self.block_ctx + 1
-                prev_block = self.block_ctx
-                return curr_block + prev_block
-        elif self.attn_func == "cross_attn":
-            return self.encoder_dims
-        elif self.attn_func == "prime_attn":
-            return min(self.sample_t, self._lyric_enc_len)
-        else:
-            raise NotImplementedError()
-
         REQUIRED_CACHE_LEN = {
             "dense_attn": self.sample_t,
             "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
@@ -1224,7 +1204,7 @@ def __init__(
         self.layer_norm_0 = JukeboxLayerNorm(config.width)
         self.mlp = JukeboxMLP(config)
         self.layer_norm_1 = JukeboxLayerNorm(config.width)
-        self.res_scale = 1.0 / config.depth if config.res_scale else 1.0
+        self.res_scale = 1.0 / config.depth if config.attn_res_scale else 1.0
         self.attn_func = attn_func
 
     def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
@@ -1468,7 +1448,6 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
         )  # Pos emb, dropout is identity at eval time
         return hidden_states, cond
 
-    # Could this be made compatible with generate
     def sample(
         self,
         n_samples,
@@ -1517,6 +1496,7 @@ def sample(
                     logits=hidden_states
                 ).sample()  # Sample and replace hidden_states
                 sampled_tokens.append(tokens.clone())
+
             del tokens
             self.transformer.del_cache()
 
@@ -1537,7 +1517,7 @@ def split_chunks(self, length, chunk_size):
     def primed_sample(
         self,
         n_samples,
-        music_tokens,
+        lyric_and_music_tokens,
         audio_conditioning=None,
         metadata_conditioning=None,
         last_encoder_hidden_states=None,
@@ -1551,17 +1531,17 @@ def primed_sample(
         if sample_tokens is None:
             sample_tokens = self.embed_dim
         # Preprocess.
-        batch_size = music_tokens.shape[0]
+        batch_size = lyric_and_music_tokens.shape[0]
         with torch.no_grad():
-            music_tokens = music_tokens.view(batch_size, -1).long()
+            lyric_and_music_tokens = lyric_and_music_tokens.view(batch_size, -1).long()
 
-        sampled_audio = torch.split(music_tokens, 1, dim=1)
+        sampled_audio = torch.split(lyric_and_music_tokens, 1, dim=1)
         sampled_audio = list(sampled_audio)
 
         if not self.audio_conditioning:
             audio_conditioning = torch.zeros(
                 (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
-            ).to(music_tokens.device)
+            ).to(lyric_and_music_tokens.device)
 
         with torch.no_grad():
             if get_preds:
@@ -1574,15 +1554,15 @@ def primed_sample(
             chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
             x_primes = []
             start = 0
-            music_tokens = None
+            token = None
 
             for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False):
                 sampled_audio_prime, conds_prime = [], []
                 for sample_t in range(start, start + current_chunk_size):
                     x_prime, cond_prime = self.get_emb(
-                        sample_t, n_samples, music_tokens, audio_conditioning, metadata_conditioning
+                        sample_t, n_samples, token, audio_conditioning, metadata_conditioning
                     )
-                    music_tokens = sampled_audio[sample_t]
+                    token = sampled_audio[sample_t]
                     sampled_audio_prime.append(x_prime)
                     conds_prime.append(cond_prime)
                 start = start + current_chunk_size
@@ -1606,13 +1586,14 @@ def primed_sample(
                 x_prime = self.fc_proj_out(x_prime)  # Predictions
                 preds.append(x_prime)
 
-            music_tokens = sampled_audio[-1]
+            # the input of the encoder and decoder can be merged into (lyrics, music tokens)
+            input_tokens = sampled_audio[-1]
 
             iter = tqdm(range(len(sampled_audio), sample_tokens))
             for sample_t in iter:
                 iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
-                    sample_t, n_samples, music_tokens, audio_conditioning, metadata_conditioning
+                    sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning
                 )
 
                 hidden_states = self.transformer(
@@ -1626,12 +1607,12 @@ def primed_sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                music_tokens = torch.distributions.Categorical(
-                    logits=hidden_states
-                ).sample()  # Sample and replace hidden_states
+                # only music tokens are sampled
+                music_tokens = torch.distributions.Categorical(logits=hidden_states).sample()
                 sampled_audio.append(music_tokens.clone())
+                input_tokens = music_tokens
 
-            del music_tokens
+            del input_tokens, music_tokens
             self.transformer.del_cache()
 
             music_tokens = torch.cat(sampled_audio, dim=1)
@@ -1666,6 +1647,7 @@ def __init__(self, config, level):
             config.res_conv_depth,
             config.res_downs_t[level],
             config.res_strides_t[level],
+            reverse_dilation = False
         )
         self.layer_norm = JukeboxLayerNorm(config.width)
 
@@ -1797,7 +1779,7 @@ def forward(self, metadata):
         return start_emb, pos_emb
 
 
-class JukeboxPrior(nn.Module):
+class JukeboxPrior(PreTrainedModel):
     """
     Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
     on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
@@ -1825,7 +1807,7 @@ def __init__(
         vqvae_encoder=None,
         vqvae_decoder=None,
     ):
-        super().__init__()
+        super().__init__(config)
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
         # forward loop
         self.vqvae_encoder = vqvae_encoder
@@ -1852,8 +1834,8 @@ def __init__(
             self.metadata_embedding = LabelConditioner(config, include_time_signal=not self.audio_conditioning)
 
         # define encoder-decoder or encoder and decoder
-        self.single_enc_dec = config.single_enc_dec
-        if config.single_enc_dec:
+        self.is_encoder_decoder = config.is_encoder_decoder
+        if config.is_encoder_decoder:
             # encoder-decoder transformer
             self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx]
             self.embed_dim_shift = [0, config.encoder_n_vocab]
@@ -1872,7 +1854,7 @@ def __init__(
 
         else:
             # Separate encoder-decoder transformer
-            #  we have to modify the config to use the encoder variables
+            #  we have to modify the config to use the encoder variables for the lyric encoder
             encoder_config = self._get_encoder_config(config)
 
             if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
@@ -1995,8 +1977,8 @@ def get_music_tokens_conds(self, music_tokens, start, end):
 
     def prior_preprocess(self, tokens, conds):
         """
-        Shifts the input tokens to account for the dictionnary merge. The prior_embed_dim_shift give by how much. the
-        music tokens should be shifted by + nb_vocab.
+        Shifts the input tokens to account for the dictionnary merge. The embed_dim_shift give by how much the
+        music tokens should be shifted by. It is equal to encoder_n_vocab.
         """
         batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
@@ -2097,7 +2079,8 @@ def sample(
         sample_tokens=None,
     ):
         """
-        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas
+        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.
+        music_tokens : previously sampled music tokens that are attended to by the prior.
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
@@ -2106,7 +2089,7 @@ def sample(
         with torch.no_grad():
             # Currently audio_conditioning only uses immediately above layer
             audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
-            if self.single_enc_dec:
+            if self.is_encoder_decoder:
                 if no_past_context:  # the prime_sample function will be used with music_tokens set to None
                     lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
                         [lyric_tokens], [None, audio_conditioning]
@@ -2201,7 +2184,7 @@ def forward_tokens(
             self.prior.transformer.set_record_attn(get_attn_weights)
         audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
 
-        if self.single_enc_dec:  # the preprocess returns the full tokens (Lyrics and Music tokens), shifted
+        if self.is_encoder_decoder:  # the preprocess returns the full tokens (Lyrics and Music tokens), shifted
             tokens, audio_conditioning = self.prior_preprocess(
                 [lyric_tokens, music_tokens], [None, audio_conditioning]
             )
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 37e28e21f42af..ff43983fdf10b 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -121,6 +121,22 @@ class Jukebox1bModelTester(unittest.TestCase):
         0.0740, 0.0889, 0.1023, 0.1162, 0.1211, 0.1212, 0.1251, 0.1336,
         0.1502, 0.1686, 0.1883, 0.2148, 0.2363, 0.2458, 0.2507, 0.2531
     ]
+    EXPECTED_AUDIO_COND = [
+        0.0256, -0.0544, 0.1600, -0.0032, 0.1066, 0.0825, -0.0013, 0.3440,
+        0.0210, 0.0412, -0.1777, -0.0892, -0.0164, 0.0285, -0.0613, -0.0617,
+        -0.0137, -0.0201, -0.0175, 0.0215, -0.0627, 0.0520, -0.0730, 0.0970,
+        -0.0100, 0.0442, -0.0586, 0.0207, -0.0015, -0.0082
+    ]
+    EXPECTED_META_COND = [
+         0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
+         0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
+        -0.0168, -0.0394, -0.1113, 0.0124, 0.0442, 0.0267, -0.0003, -0.1536,
+        -0.0116, -0.1837, -0.0180, -0.1026, -0.0777, -0.0456
+    ]
+    EXPECTED_LYRIC_COND = [
+        76, 27, 40, 30, 76, 46, 44, 47, 40, 37, 38, 31, 45, 45, 76, 38, 31, 33,
+        45, 76, 41, 32, 76, 45, 46, 41, 40, 31, 78, 76
+    ]
     # fmt: on
 
     def prepare_inputs(self):
@@ -136,15 +152,15 @@ def test_sampling(self):
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
+        torch.testing.assert_allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         set_seed(0)
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
+        torch.testing.assert_allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         set_seed(0)
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
+        torch.testing.assert_allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -158,29 +174,34 @@ def test_slow_sampling(self):
 
         top_prior = model.priors[0]
         start = 0
-        z_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
-        y = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
+        music_token_conds = top_prior.get_music_tokens_conds(zs, start=start, end=start + top_prior.n_ctx)
+        metadata = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
+
+        self.assertIsNone(music_token_conds)
+        self.assertListEqual(metadata.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
-        self.assertIsNone(z_conds)
-        self.assertListEqual(y.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
+        audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata.cpu())
+        torch.testing.assert_allclose(audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol = 1e-4, rtol= 1e-4)
+        torch.testing.assert_allclose(metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol = 1e-4, rtol= 1e-4)
+        torch.testing.assert_allclose(lyric_tokens[0,:30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol = 1e-4, rtol= 1e-4)
 
         set_seed(0)
         top_prior.cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        torch.testing.assert_allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
         top_prior.cpu()
 
         set_seed(0)
         model.priors[1].cuda()
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        torch.testing.assert_allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
         model.priors[1].cpu()
 
         set_seed(0)
         model.priors[2].cuda()
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+        torch.testing.assert_allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
     def test_primed_sampling(self):
@@ -196,7 +217,7 @@ def test_primed_sampling(self):
         zs = model._sample(
             zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
         )
-        assert torch.allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
+        torch.testing.assert_allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
         model.priors[0].cpu()
 
         model.priors[1].cuda()
@@ -205,7 +226,7 @@ def test_primed_sampling(self):
         zs = model._sample(
             zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
         )
-        assert torch.allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
+        torch.testing.assert_allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
         model.priors[1].cpu()
 
         model.priors[2].cuda()
@@ -218,7 +239,7 @@ def test_primed_sampling(self):
         zs = model._sample(
             zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
         )
-        assert torch.allclose(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
+        torch.testing.assert_allclose(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow
     def test_vqvae(self):
@@ -227,11 +248,11 @@ def test_vqvae(self):
         x = torch.rand((1, 5120, 1))
         with torch.no_grad():
             zs = model.vqvae.encode(x, start_level=2, bs_chunks=x.shape[0])
-        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
+        torch.testing.assert_allclose(zs[0][0], torch.tensor(self.EXPECTED_VQVAE_ENCODE))
 
         with torch.no_grad():
             x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
-        assert torch.allclose(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4)
+        torch.testing.assert_allclose(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol= 1e-4)
 
 
 @require_torch
@@ -320,15 +341,15 @@ def test_sampling(self):
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cpu() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
+        torch.testing.assert_allclose(zs[0][0], torch.tensor(self.EXPECTED_OUTPUT_2))
 
         set_seed(0)
         zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
+        torch.testing.assert_allclose(zs[1][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         set_seed(0)
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
+        torch.testing.assert_allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
     def test_slow_sampling(self):
@@ -339,19 +360,19 @@ def test_slow_sampling(self):
         model.priors[0].cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        torch.testing.assert_allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
         model.priors[0].cpu()
 
         set_seed(0)
         model.priors[1].cuda()
         zs = model._sample(zs, labels, [1], sample_length=60 * model.priors[1].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
+        torch.testing.assert_allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
         model.priors[1].cpu()
 
         set_seed(0)
         model.priors[2].cuda()
         zs = model._sample(zs, labels, [2], sample_length=60 * model.priors[2].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
+        torch.testing.assert_allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
 
     @slow
     def test_fp16_slow_sampling(self):
@@ -361,4 +382,4 @@ def test_fp16_slow_sampling(self):
         set_seed(0)
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=60 * model.priors[0].raw_to_tokens, save_results=False)
-        assert torch.allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
+        torch.testing.assert_allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))

From 53e300c223f2cb1e5158467fe916651000f606eb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 15:56:02 +0000
Subject: [PATCH 158/196] style

---
 .../models/jukebox/modeling_jukebox.py             |  8 ++++----
 tests/models/jukebox/test_modeling_jukebox.py      | 14 ++++++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2b3a751a74e3c..f76d5498d1db9 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -322,7 +322,7 @@ def forward(self, hidden_states):
 
 
 class JukeboxDecoderConvBock(nn.Module):
-    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation = True):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation=True):
         self.embed_dim = embed_dim
         self.hidden_dim = hidden_dim
         super().__init__()
@@ -1647,7 +1647,7 @@ def __init__(self, config, level):
             config.res_conv_depth,
             config.res_downs_t[level],
             config.res_strides_t[level],
-            reverse_dilation = False
+            reverse_dilation=False,
         )
         self.layer_norm = JukeboxLayerNorm(config.width)
 
@@ -2016,7 +2016,7 @@ def embed_tokens(self, music_tokens_conds):
         """
         Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
         """
-        music_tokens_conds = music_tokens_conds[: self.cond_level+1]
+        music_tokens_conds = music_tokens_conds[: self.cond_level + 1]
         audio_conditioning = None
         for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))):
             audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
@@ -2252,7 +2252,7 @@ def _init_weights(self, module):
         elif isinstance(module, JukeboxConv1D):
             if self.config.zero_out:
                 module.weight.data.zero_()
-            else :
+            else:
                 module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
         elif isinstance(module, JukeboxPositionalEmbedding):
             module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index ff43983fdf10b..d5f27e3083908 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -181,9 +181,15 @@ def test_slow_sampling(self):
         self.assertListEqual(metadata.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
         audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata.cpu())
-        torch.testing.assert_allclose(audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol = 1e-4, rtol= 1e-4)
-        torch.testing.assert_allclose(metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol = 1e-4, rtol= 1e-4)
-        torch.testing.assert_allclose(lyric_tokens[0,:30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol = 1e-4, rtol= 1e-4)
+        torch.testing.assert_allclose(
+            audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol=1e-4, rtol=1e-4
+        )
+        torch.testing.assert_allclose(
+            metadata_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_META_COND), atol=1e-4, rtol=1e-4
+        )
+        torch.testing.assert_allclose(
+            lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
+        )
 
         set_seed(0)
         top_prior.cuda()
@@ -252,7 +258,7 @@ def test_vqvae(self):
 
         with torch.no_grad():
             x = model.vqvae.decode(zs, start_level=2, bs_chunks=x.shape[0])
-        torch.testing.assert_allclose(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol= 1e-4)
+        torch.testing.assert_allclose(x[0, :40, 0], torch.tensor(self.EXPECTED_VQVAE_DECODE), atol=1e-4, rtol=1e-4)
 
 
 @require_torch

From 61d1bb2ee87dbb86435e25f13d20efb787089100 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 15:57:28 +0000
Subject: [PATCH 159/196] nits

---
 src/transformers/models/jukebox/modeling_jukebox.py | 2 +-
 tests/models/jukebox/test_modeling_jukebox.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index f76d5498d1db9..452250c283e06 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -151,7 +151,7 @@ def get_alignment(music_tokens, labels, prior, config):
         weights = torch.cat(w_hops, dim=0)
         del w_hops
         alignment_hop = weights.float().cpu().numpy()
-        del w
+        del weights
 
         # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
         # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index d5f27e3083908..91919ce20aae9 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -128,8 +128,8 @@ class Jukebox1bModelTester(unittest.TestCase):
         -0.0100, 0.0442, -0.0586, 0.0207, -0.0015, -0.0082
     ]
     EXPECTED_META_COND = [
-         0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
-         0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
+        0.0415, 0.0877, 0.0022, -0.0055, 0.0751, 0.0334, 0.0324, -0.0068,
+        0.0011, 0.0017, -0.0676, 0.0655, -0.0143, 0.0399, 0.0303, 0.0743,
         -0.0168, -0.0394, -0.1113, 0.0124, 0.0442, 0.0267, -0.0003, -0.1536,
         -0.0116, -0.1837, -0.0180, -0.1026, -0.0777, -0.0456
     ]

From 142cd718e8ae1c3188bcd82e2fdac98cc35f32b7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:18:54 +0000
Subject: [PATCH 160/196] add prior doc

---
 .../models/jukebox/configuration_jukebox.py   | 361 +++++++++---------
 1 file changed, 184 insertions(+), 177 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 1e97645465557..2710873487741 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -141,109 +141,125 @@ class JukeboxPriorConfig(PretrainedConfig):
 
 
     Args:
-        metadata_dims (`List[Tuple[int, int]]`, *optional*, defaults to `[(604, 7898), (120, 4111), (120, 4111)]`):
-            List containing the number of genres and the number of artists that were used to train the embedding layers
-            of each of the prior models.
-        is_encoder_decoder (`List[bool]`, *optional*, defaults to `[True, False, False]`):
-            Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
-            `encoderoder` for each of the priors.
-        merged_decoder (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not the decoder is merged with the encoder.
-        lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not to use the lyrics as conditioning.
-        nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
-            Number of tokens that are used when sampling a single window of length `prior_n_ctx`
-        zero_out (`bool`, *optional*, defaults to False):
-            Zero out weights when initialising.
-        depth (`list`, *optional*, defaults to [3, 16, 16]):
-            Number of layers to use for the music conditioner.
-        width (`list`, *optional*, defaults to [128, 1024, 1024]):
-            Width of the audio conditioning layer.
-        dilation_growth_rate (`list`, *optional*, defaults to [1, 3, 3]):
-            Dilation grow rate used between each convolutionnal block.
-        dilation_cycle (`list`, *optional*, defaults to [None, 8, 8]):
-            Cycle of dilation to use. Usually similar to the ones used in the VQVAE.
-        res_scale (`list`, *optional*, defaults to [None, True, False]):
-            Wheter or not to scale the residuals in the audio conditionner block. Since the top level prior doeas not
+        act_fn (`str`, *optional*, defaults to "quick_gelu"):
+            Activation function.
+        alignment_head (`int`, *optional*, defaults to 2):
+            Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio alignment
+        alignment_layer (`int`, *optional*, defaults to 68):
+            Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio alignment
+        attention_multiplier (`float`, *optional*, defaults to 0.25):
+            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
+            0.25*width of the model will be used.
+        attention_pattern (`str`, *optional*, defaults to "enc_dec_with_lyrics"):
+            Which attention pattern to use for the decoder/
+        attn_dropout (`int`, *optional*, defaults to 0):
+            Dropout probability for the post-attention layer dropout in the decoder.
+        attn_res_scale (`bool`, *optional*, defaults to False):
+            Wheter or not to scale the residuals in the attention conditionner block.
+        blocks (`int`, *optional*, defaults to 64):
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
+        conv_res_scale (`int`, *optional*, defaults to None):
+            Wheter or not to scale the residuals in the conditionner block. Since the top level prior doeas not
             have a conditionner, the default value is to None and should not be modified.
-        convolution_multiplier (`int`, *optional*, defaults to 1):
-            Conditionner multiplier (the input states are mulitplied by that parameter for each convolution.
-        downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-            Downsampling rates used in the audio conditioning network
-        strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-            Striding used in the audio conditioning network
-        encoder_spread (`bool`, *optional*, defaults to `False`):
-            Spread used in the attention pattern
-        encoder_width (`list`, *optional*, defaults to [128, 128, 128]):
-            Width of the lyric encoder
-        encoder_depth (`list`, *optional*, defaults to [18, 3, 3]):
-            Number of encoder blocks used in the lyric encoder
-        encoder_heads (`int`, *optional*, defaults to 4):
-            Number of heads in the lyric encoder
+        depth (`int`, *optional*, defaults to 72):
+            Number of layers of the decoder architecture. #TODO replace with num decoder_layers?
+        emb_dropout (`int`, *optional*, defaults to 0):
+            Embedding dropout used in the lyric decoder.
+        embed_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the audio embedings. I can be different with the `width` for smaller models. 
         encoder_attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
-        encoder_mlp_multiplier (`float`, *optional*, defaults to 1.0):
-            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
-            the model will be used.
-        encoder_blocks (`int`, *optional*, defaults to 32):
-            Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
-        encoder_init_scale (`list`, *optional*, defaults to [0.1, 0.4, 0.4]):
-            Initialisation scales for the lyric encoder modules.
-        encoder_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
-            Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
-        encoder_attention_pattern (`list`, *optional*, defaults to [2, 0, 0]):
+        encoder_attention_pattern (`str`, *optional*, defaults to "RawColumnPreviousRowAttention"):
             Which attention pattern to use for the lyric encoder.
         encoder_attn_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability for the post-attention layer dropout in the lyric encoder.
-        encoder_resid_dropout (`float`, *optional*, defaults to 0.0):
-            Residual dropout used in the attention pattern of the lyric encoder.
+        encoder_attn_res_scale (`bool`, *optional*, defaults to False):
+            Wheter or not to scale the residuals in the attention conditionner block.
+        encoder_blocks (`int`, *optional*, defaults to 32):
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
+        encoder_depth (`int`, *optional*, defaults to 18):
+            Depth of the encoder model.
         encoder_emb_dropout (`float`, *optional*, defaults to 0.0):
             Embedding dropout used in the lyric encoder.
-        encoder_zero_out (`bool`, *optional*, defaults to `False`):
-            Whether or not to set to zeros the weights the MLPs in the lyric encoder.
-        encoder_res_scale (`bool`, *optional*, defaults to `False`):
-            Residual scaling factor used in the lyric encoder attention patterns.
+        encoder_heads (`int`, *optional*, defaults to 4):
+            Number of heads in the lyric encoder
+        encoder_init_scale (`float`, *optional*, defaults to 0.1):
+            Initialisation scales for the lyric encoder modules.
+        encoder_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
+            Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
+        encoder_mlp_multiplier (`float`, *optional*, defaults to 1.0):
+            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
+            the model will be used.
         encoder_n_vocab (`int`, *optional*, defaults to 79):
-            Defines the number of different tokens that can be represented by the `inputs_ids` passed to the
-            `encoderoder`
-        init_scale (`list`, *optional*, defaults to [0.2, 1, 1]):
+            Defines the number of different lyric tokens that can be represented by the `inputs_ids` passed to the
+            `encoder`.
+        encoder_resid_dropout (`float`, *optional*, defaults to 0.0):
+            Residual dropout used in the attention pattern of the lyric encoder.
+        encoder_spread (`int`, *optional*, defaults to None):
+            Spread used in the `summary_spread_attention` pattern
+        encoder_width (`int`, *optional*, defaults to 128):
+            Width of the lyric encoder if `is_encoder_decoder=False` and `nb_relevant_lyric_tokens>0`
+        encoder_zero_out (`bool`, *optional*, defaults to False):
+            Whether or not to set to zeros the weights the convolutions in the lyric encoder.
+        init_scale (`float`, *optional*, defaults to 0.2):
             Initialisation scales for the prior modules.
-        spread (`bool`, *optional*, defaults to False):
-            Spread used in the attention pattern
-        zero_out (`bool`, *optional*, defaults to False):
-             Whether or not to set to zeros the weights the MLPs of the priors.
-        res_scale (`bool`, *optional*, defaults to False):
-            Residual scaling factor used in every prior's attention layer.
-        n_ctx (`tuple`, *optional*, defaults to (6144, 8192, 8192)):
+        is_encoder_decoder (`bool`, *optional*, defaults to True):
+            Whether or not the prior is an encoder-decoder model. In case it is not,
+            and `nb_relevant_lyric_tokens` is greater than 0, the `encoder` args
+            should be specified for the lyric encoding.
+        mask (`bool`, *optional*, defaults to False):
+            Whether or not to mask the previous positions in the attention.
+        max_duration (`int`, *optional*, defaults to 600):
+            _description_
+        max_nb_genres (`int`, *optional*, defaults to 1):
+            _description_
+        merged_decoder (`bool`, *optional*, defaults to True):
+            Whether or not the decoder and the encoder inputs are merged. This is used for the seperated encoder-decoder architecture
+        metadata_conditioning (`bool`, *optional*, defaults to True):
+            _description_
+        metadata_dims (`tuple(int)`, *optional*, defaults to (604, 7898)):
+            Number of genres and the number of artists that were used to train the embedding layers
+            of the prior models.
+        min_duration (`int`, *optional*, defaults to 0):
+            _description_
+        mlp_multiplier (`float`, *optional*, defaults to 1.0):
+            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
+            the model will be used.
+        n_ctx (`int`, *optional*, defaults to 6144):
             Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
             generating music tokens.
-        latent_dim (`int`, *optional*, defaults to 2048):
-            Dimension of the latent music token space. Default value match the `vqvae_codebook_dimension`.
-        width (`list`, *optional*, defaults to [2048, 1920, 1920]):
-            Input and output dimension of the attention layers of each prior.
-        attention_multiplier (`float`, *optional*, defaults to 0.25):
-            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
-            0.25*width of the model will be used.
-        depth (`list`, *optional*, defaults to [72, 72, 72]):
-            Depth of each prior. Defines the number of `attn_block`.
-        n_heads (`list`, *optional*, defaults to [2, 1, 1]):
-            Number of attention heads per prior.
-        attention_pattern (`list`, *optional*, defaults to [12, 2, 2]):
-            Attention patterns to use in each prior. Depending on the value, cross attention, block attention and
-            sparse attention blocks are stacked.
-        blocks (`int`, *optional*, defaults to 64):
-            Sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
-        alignment_layer (`list`, *optional*, defaults to [68, None, None]):
-            Layer corresponding to the alignemnt between the lyrics and the audio.
-        alignment_head (`list`, *optional*, defaults to [2, None, None]):
-            Index of the attention head which takes care of the alignemnt between the lyrics and the audio.
-        attn_dropout (`int`, *optional*, defaults to 0):
-            Dropout probability for the post-attention layer dropout of the prior models.
+        n_heads (`int`, *optional*, defaults to 2):
+             Number of attention heads.
+        nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
+            Number of lyric tokens that are used when sampling a single window of length `prior_n_ctx`
+        res_conv_depth (`int`, *optional*, defaults to 3):
+            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the `JukeboxMusicTokenConditioner`.
+        res_conv_width (`int`, *optional*, defaults to 128):
+            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the `JukeboxMusicTokenConditioner`.
+        res_convolution_multiplier (`int`, *optional*, defaults to 1):
+            Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
+        res_dilation_cycle (`int`, *optional*, defaults to None):
+            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the corresponding level of the VQVAE.
+            The first prior does not use it as it is not conditioned on upper level tokens.
+        res_dilation_growth_rate (`int`, *optional*, defaults to 1):
+            Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
+        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
+            Downsampling rates used in the audio conditioning network
+        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
+            Striding used in the audio conditioning network
         resid_dropout (`int`, *optional*, defaults to 0):
-            Residual dropout probability used in the attention layers of the prior models.
-        emb_dropout (`int`, *optional*, defaults to 0):
-            Dropout applied to the embedding layer of the priors.
+            Residual dropout used in the attention pattern.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            _description_
+        spread (`int`, *optional*, defaults to None):
+            Spread used in the `summary_spread_attention` pattern
+        timing_dims (`int`, *optional*, defaults to 64):
+            _description_
+        width (`int`, *optional*, defaults to 2048):
+           Dimension of the attention layers. # TODO this is a bit confusing
+        zero_out (`bool`, *optional*, defaults to False):
+            Whether or not to zero out convolution weights when initialising.
     """
 
     model_type = "jukebox"
@@ -255,128 +271,119 @@ class JukeboxPriorConfig(PretrainedConfig):
 
     def __init__(
         self,
-        sampling_rate=44100,
-        timing_dims=64,
-        min_duration=0,
-        max_duration=600,
-        max_nb_genres=1,
-        metadata_conditioning=True,
-        zero_out=False,
-        res_conv_depth=3,
-        res_conv_width=128,
-        res_dilation_growth_rate=1,
-        res_dilation_cycle=None,
+        act_fn="quick_gelu",
+        alignment_head=2,
+        alignment_layer=68,
+        attention_multiplier=0.25,
+        attention_pattern="enc_dec_with_lyrics",
+        attn_dropout=0,
+        attn_res_scale=False,
+        blocks=64,
         conv_res_scale=None,
-        res_convolution_multiplier=1,
-        res_downs_t=(3, 2, 2),
-        res_strides_t=(2, 2, 2),
-        encoder_spread=None,
-        encoder_width=128,
-        encoder_depth=18,
-        encoder_heads=4,
+        depth=72,
+        emb_dropout=0,
+        embed_dim=2048,
         encoder_attention_multiplier=0.25,
-        encoder_mlp_multiplier=1.0,
+        encoder_attention_pattern="RawColumnPreviousRowAttention",
+        encoder_attn_dropout=0.0,
         encoder_blocks=32,
+        encoder_depth=18,
+        encoder_emb_dropout=0.0,
+        encoder_heads=4,
         encoder_init_scale=0.1,
         encoder_loss_fraction=[0.4, 0.0, 0.0],
-        encoder_attention_pattern="RawColumnPreviousRowAttention",
-        encoder_attn_dropout=0.0,
+        encoder_mlp_multiplier=1.0,
+        encoder_n_vocab=79,
+        encoder_attn_res_scale=False,
         encoder_resid_dropout=0.0,
-        encoder_emb_dropout=0.0,
+        encoder_spread=None,
+        encoder_width=128,
         encoder_zero_out=False,
-        encoder_res_scale=False,
-        encoder_n_vocab=79,
         init_scale=0.2,
-        attn_res_scale=False,
-        n_ctx=6144,
-        width=2048,
-        depth=72,
-        n_heads=2,
-        attention_pattern="enc_dec_with_lyrics",
-        alignment_layer=68,
-        alignment_head=2,
-        metadata_dims=(604, 7898),
         is_encoder_decoder=True,
-        merged_decoder=True,
         lyric_conditioning=True,
-        nb_relevant_lyric_tokens=384,
-        embed_dim=2048,
-        spread=None,
-        blocks=64,
-        attention_multiplier=0.25,
+        mask=False,
+        max_duration=600,
+        max_nb_genres=1,
+        merged_decoder=True,
+        metadata_conditioning=True,
+        metadata_dims=(604, 7898),
+        min_duration=0,
         mlp_multiplier=1.0,
-        attn_dropout=0,
+        n_ctx=6144,
+        n_heads=2,
+        nb_relevant_lyric_tokens=384,
+        res_conv_depth=3,
+        res_conv_width=128,
+        res_convolution_multiplier=1,
+        res_dilation_cycle=None,
+        res_dilation_growth_rate=1,
+        res_downs_t=(3, 2, 2),
+        res_strides_t=(2, 2, 2),
         resid_dropout=0,
-        emb_dropout=0,
-        mask=False,
-        act_fn="quick_gelu",
+        sampling_rate=44100,
+        spread=None,
+        timing_dims=64,
+        width=2048,
+        zero_out=False,
         **kwargs
     ):
-        self.metadata_dims = metadata_dims
-        self.res_conv_depth = res_conv_depth
-        self.res_conv_width = res_conv_width
-        #  Auto regressive (decoder) kwargs :
+
+        self.act_fn = act_fn
+        self.alignment_head = alignment_head
+        self.alignment_layer = alignment_layer
+        self.attention_multiplier = attention_multiplier
         self.attention_pattern = attention_pattern
-        self.n_heads = n_heads
-        self.depth = depth
-        self.width = width
-        self.n_ctx = n_ctx
-        self.embed_dim = embed_dim
         self.attn_dropout = attn_dropout
-        self.resid_dropout = resid_dropout
-        self.emb_dropout = emb_dropout
-        self.zero_out = zero_out
-        self.conv_res_scale = conv_res_scale
+        self.attn_res_scale = attn_res_scale
         self.blocks = blocks
-        self.attention_multiplier = attention_multiplier
-        self.mlp_multiplier = mlp_multiplier
-        self.spread = spread
-        self.alignment_layer = alignment_layer
-        self.alignment_head = alignment_head
-        self.init_scale = init_scale
-
-        # Audio conditioning : upsampler parameters
+        self.conv_res_scale = conv_res_scale
         self.depth = depth
-        self.width = width
-        self.res_dilation_growth_rate = res_dilation_growth_rate
-        self.res_dilation_cycle = res_dilation_cycle
-        self.zero_out = zero_out
-        self.res_convolution_multiplier = res_convolution_multiplier
-        self.attn_res_scale = attn_res_scale
-        self.res_downs_t = res_downs_t
-        self.res_strides_t = res_strides_t
-
-        # Lyric conditioning
-        self.merged_decoder = merged_decoder  # is this equivalent ?
-        self.is_encoder_decoder = is_encoder_decoder
-        self.lyric_conditioning = lyric_conditioning
-        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
-
-        self.encoder_attn_dropout = encoder_attn_dropout
+        self.emb_dropout = emb_dropout
+        self.embed_dim = embed_dim
+        self.encoder_attention_multiplier = encoder_attention_multiplier
         self.encoder_attention_pattern = encoder_attention_pattern
+        self.encoder_attn_dropout = encoder_attn_dropout
+        self.encoder_attn_res_scale = encoder_attn_res_scale
         self.encoder_blocks = encoder_blocks
         self.encoder_depth = encoder_depth
         self.encoder_emb_dropout = encoder_emb_dropout
         self.encoder_heads = encoder_heads
         self.encoder_init_scale = encoder_init_scale
         self.encoder_loss_fraction = encoder_loss_fraction
-        self.encoder_attention_multiplier = encoder_attention_multiplier
         self.encoder_mlp_multiplier = encoder_mlp_multiplier
+        self.encoder_n_vocab = encoder_n_vocab
         self.encoder_resid_dropout = encoder_resid_dropout
-        self.encoder_res_scale = encoder_res_scale
         self.encoder_spread = encoder_spread
         self.encoder_width = encoder_width
         self.encoder_zero_out = encoder_zero_out
-        self.encoder_n_vocab = encoder_n_vocab
+        self.init_scale = init_scale
+        self.is_encoder_decoder = is_encoder_decoder
+        self.lyric_conditioning = lyric_conditioning
         self.mask = mask
-        self.act_fn = act_fn
-
-        self.sampling_rate = sampling_rate
-        self.timing_dims = timing_dims
-        self.min_duration = min_duration
         self.max_duration = max_duration
         self.max_nb_genres = max_nb_genres
+        self.merged_decoder = merged_decoder
         self.metadata_conditioning = metadata_conditioning
+        self.metadata_dims = metadata_dims
+        self.min_duration = min_duration
+        self.mlp_multiplier = mlp_multiplier
+        self.n_ctx = n_ctx
+        self.n_heads = n_heads
+        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
+        self.res_conv_depth = res_conv_depth
+        self.res_conv_width = res_conv_width
+        self.res_convolution_multiplier = res_convolution_multiplier
+        self.res_dilation_cycle = res_dilation_cycle
+        self.res_dilation_growth_rate = res_dilation_growth_rate
+        self.res_downs_t = res_downs_t
+        self.res_strides_t = res_strides_t
+        self.resid_dropout = resid_dropout
+        self.sampling_rate = sampling_rate
+        self.spread = spread
+        self.timing_dims = timing_dims
+        self.width = width
+        self.zero_out = zero_out
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":

From 099b24419d5fca0182ff698c56affb82c4f42240 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:31:30 +0000
Subject: [PATCH 161/196] add vqvae docstring

---
 .../models/jukebox/configuration_jukebox.py   | 101 ++++++++++--------
 1 file changed, 58 insertions(+), 43 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 2710873487741..303e22dbe3d11 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -131,9 +131,10 @@
 class JukeboxPriorConfig(PretrainedConfig):
     """
     This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
-    `JukeboxPriorl` according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the top level prior fro the
-    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/ukebox-1b-lyrics) architecture.
+    `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the top level prior from the
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
+-1b-lyrics) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -405,67 +406,81 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 
 class JukeboxVQVAEConfig(PretrainedConfig):
     """
-        hop_fraction (`list`, *optional*, defaults to [0.125, 0.5, 0.5]):
+    This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
+    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the VQVAE from
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
+-1b-lyrics) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+        act_fn (`str`, *optional*, defaults to "relu"):
+            _description_
+        codebook_dimension (`int`, *optional*, defaults to 2048):
+            Number of codes to use in each of the VQVAE.
+        commit (`float`, *optional*, defaults to 0.02):
+            Commit loss multiplier.
+        conv_input_shape (`int`, *optional*, defaults to 1):
+            Number of audio channels.
+        conv_res_scale (`bool`, *optional*, defaults to False):
+            _description_
+        embed_dim (`int`, *optional*, defaults to 64):
+            Embeding dimension of the codebook vectors.
+        hop_fraction (`List[`int`]`, *optional*, defaults to [0.125, 0.5, 0.5]):
             Fraction of non-intersecting window used when continuing the sampling process.
-        input_channels:
-            number of audio channels
-        sample_length:
-            on which the VQVAE was trained. Provides the max output shape of the VQVAE
         levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
-        downs_t (`tuple`, *optional*, defaults to (3, 2, 2)):
-            Downsampling rate for each level of the hierachical VQ-VAE.
-        strides_t (`tuple`, *optional*, defaults to (2, 2, 2)):
-            Stride used for each level of the hierachical VQ-VAE.
-        embed_dim (`int`, *optional*, defaults to 64):
-            Dimension of the codebook vectors.
-        codebook_dimension (`int`, *optional*, defaults to 2048):
-            Number of codes to use in each of the VQVAE.
-        convolution_multiplier (`int`, *optional*, defaults to 1):
-            Projection factor used in the `JukeboxResConv1DBlock`.
-        dilation_growth_rate (`int`, *optional*, defaults to 3):
-            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
-        dilation_cycle (`int`, *optional*, defaults to None):
-            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
-            of reduced by a power of `dilation_cycle`.
-        multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
-            Depth and width multipliers used for each level. Used on the `conv_block_width` and `conv_block_depth`
         lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
-        commit (`float`, *optional*, defaults to 0.02):
-            Commit loss multiplier.
-        conv_block_depth (`int`, *optional*, defaults to 4):
+        multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
+            Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
+        res_conv_depth (`int`, *optional*, defaults to 4):
             Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
-        conv_block_width (`int`, *optional*, defaults to 32):
+        res_conv_width (`int`, *optional*, defaults to 32):
             Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
-        reverse_decoder_dilation (`int`, *optional*, defaults to 1):
-            Whether or not to reverse the dilation rate for the decoder.
-    Example:
+        res_convolution_multiplier (`int`, *optional*, defaults to 1):
+            Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
+        res_dilation_cycle (`_type_`, *optional*, defaults to None):
+             Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
+            of reduced by a power of `res_dilation_cycle`.
+        res_dilation_growth_rate (`int`, *optional*, defaults to 3):
+            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
+        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
+             Downsampling rate for each level of the hierachical VQ-VAE.
+        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
+            Stride used for each level of the hierachical VQ-VAE.
+        sample_length (`int`, *optional*, defaults to 1058304):
+            Provides the max input shape of the VQVAE. Is used to compute the input shape of
+            each level.
     """
 
     def __init__(
         self,
-        hop_fraction=[0.125, 0.5, 0.5],
-        sample_length=1058304,
-        levels=3,
-        embed_dim=64,
+        act_fn="relu",
         codebook_dimension=2048,
-        lmu=0.99,
         commit=0.02,
         conv_input_shape=1,
-        res_downs_t=(3, 2, 2),
-        res_strides_t=(2, 2, 2),
+        conv_res_scale=False,
+        embed_dim=64,
+        hop_fraction=[0.125, 0.5, 0.5],
+        levels=3,
+        lmu=0.99,
         multipliers=(2, 1, 1),
-        res_conv_width=32,
         res_conv_depth=4,
+        res_conv_width=32,
         res_convolution_multiplier=1,
-        res_dilation_growth_rate=3,
         res_dilation_cycle=None,
-        conv_res_scale=False,
-        act_fn="relu",
+        res_dilation_growth_rate=3,
+        res_downs_t=(3, 2, 2),
+        res_strides_t=(2, 2, 2),
+        sample_length=1058304,
         **kwargs
     ):
+
+
+
         self.hop_fraction = hop_fraction
         self.conv_input_shape = conv_input_shape
         self.sample_length = sample_length

From 5c5bce8bef18590e51f61640eb6fb85104a9b239 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:38:12 +0000
Subject: [PATCH 162/196] add prior to init and models

---
 src/transformers/__init__.py                  |  9 ++-
 .../models/jukebox/configuration_jukebox.py   | 77 ++++++++++---------
 .../models/jukebox/modeling_jukebox.py        | 10 +--
 src/transformers/utils/dummy_pt_objects.py    |  7 ++
 4 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index ca49dedd4052a..0f9a286998291 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1438,7 +1438,13 @@
         ]
     )
     _import_structure["models.jukebox"].extend(
-        ["JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST", "JukeboxModel", "JukeboxPreTrainedModel", "JukeboxVQVAE"]
+        [
+            "JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "JukeboxModel",
+            "JukeboxPreTrainedModel",
+            "JukeboxVQVAE",
+            "JukeboxPrior",
+        ]
     )
     _import_structure["models.layoutlm"].extend(
         [
@@ -4289,6 +4295,7 @@
             JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
             JukeboxModel,
             JukeboxPreTrainedModel,
+            JukeboxPrior,
             JukeboxVQVAE,
         )
         from .models.layoutlm import (
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 303e22dbe3d11..cc4dde150f1fd 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -130,14 +130,14 @@
 
 class JukeboxPriorConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
-    `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the top level prior from the
-    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
--1b-lyrics) architecture.
+        This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
+        `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
+        configuration with the defaults will yield a similar configuration to that of the top level prior from the
+        [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
+    -1b-lyrics) architecture.
 
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
+        Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+        documentation from [`PretrainedConfig`] for more information.
 
 
 
@@ -145,9 +145,11 @@ class JukeboxPriorConfig(PretrainedConfig):
         act_fn (`str`, *optional*, defaults to "quick_gelu"):
             Activation function.
         alignment_head (`int`, *optional*, defaults to 2):
-            Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio alignment
+            Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
+            alignment
         alignment_layer (`int`, *optional*, defaults to 68):
-            Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio alignment
+            Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the
+            lyric to audio alignment
         attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
@@ -158,16 +160,17 @@ class JukeboxPriorConfig(PretrainedConfig):
         attn_res_scale (`bool`, *optional*, defaults to False):
             Wheter or not to scale the residuals in the attention conditionner block.
         blocks (`int`, *optional*, defaults to 64):
-            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
+            blocks] in the `JukeboxAttention` layer.
         conv_res_scale (`int`, *optional*, defaults to None):
-            Wheter or not to scale the residuals in the conditionner block. Since the top level prior doeas not
-            have a conditionner, the default value is to None and should not be modified.
+            Wheter or not to scale the residuals in the conditionner block. Since the top level prior doeas not have a
+            conditionner, the default value is to None and should not be modified.
         depth (`int`, *optional*, defaults to 72):
             Number of layers of the decoder architecture. #TODO replace with num decoder_layers?
         emb_dropout (`int`, *optional*, defaults to 0):
             Embedding dropout used in the lyric decoder.
         embed_dim (`int`, *optional*, defaults to 2048):
-            Dimension of the audio embedings. I can be different with the `width` for smaller models. 
+            Dimension of the audio embedings. I can be different with the `width` for smaller models.
         encoder_attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
@@ -178,7 +181,8 @@ class JukeboxPriorConfig(PretrainedConfig):
         encoder_attn_res_scale (`bool`, *optional*, defaults to False):
             Wheter or not to scale the residuals in the attention conditionner block.
         encoder_blocks (`int`, *optional*, defaults to 32):
-            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len // blocks] in the `JukeboxAttention` layer.
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
+            blocks] in the `JukeboxAttention` layer.
         encoder_depth (`int`, *optional*, defaults to 18):
             Depth of the encoder model.
         encoder_emb_dropout (`float`, *optional*, defaults to 0.0):
@@ -206,9 +210,8 @@ class JukeboxPriorConfig(PretrainedConfig):
         init_scale (`float`, *optional*, defaults to 0.2):
             Initialisation scales for the prior modules.
         is_encoder_decoder (`bool`, *optional*, defaults to True):
-            Whether or not the prior is an encoder-decoder model. In case it is not,
-            and `nb_relevant_lyric_tokens` is greater than 0, the `encoder` args
-            should be specified for the lyric encoding.
+            Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
+            greater than 0, the `encoder` args should be specified for the lyric encoding.
         mask (`bool`, *optional*, defaults to False):
             Whether or not to mask the previous positions in the attention.
         max_duration (`int`, *optional*, defaults to 600):
@@ -216,12 +219,13 @@ class JukeboxPriorConfig(PretrainedConfig):
         max_nb_genres (`int`, *optional*, defaults to 1):
             _description_
         merged_decoder (`bool`, *optional*, defaults to True):
-            Whether or not the decoder and the encoder inputs are merged. This is used for the seperated encoder-decoder architecture
+            Whether or not the decoder and the encoder inputs are merged. This is used for the seperated
+            encoder-decoder architecture
         metadata_conditioning (`bool`, *optional*, defaults to True):
             _description_
         metadata_dims (`tuple(int)`, *optional*, defaults to (604, 7898)):
-            Number of genres and the number of artists that were used to train the embedding layers
-            of the prior models.
+            Number of genres and the number of artists that were used to train the embedding layers of the prior
+            models.
         min_duration (`int`, *optional*, defaults to 0):
             _description_
         mlp_multiplier (`float`, *optional*, defaults to 1.0):
@@ -231,18 +235,21 @@ class JukeboxPriorConfig(PretrainedConfig):
             Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
             generating music tokens.
         n_heads (`int`, *optional*, defaults to 2):
-             Number of attention heads.
+                Number of attention heads.
         nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
             Number of lyric tokens that are used when sampling a single window of length `prior_n_ctx`
         res_conv_depth (`int`, *optional*, defaults to 3):
-            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the `JukeboxMusicTokenConditioner`.
+            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+            `JukeboxMusicTokenConditioner`.
         res_conv_width (`int`, *optional*, defaults to 128):
-            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the `JukeboxMusicTokenConditioner`.
+            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+            `JukeboxMusicTokenConditioner`.
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
         res_dilation_cycle (`int`, *optional*, defaults to None):
-            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the corresponding level of the VQVAE.
-            The first prior does not use it as it is not conditioned on upper level tokens.
+            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
+            corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
+            tokens.
         res_dilation_growth_rate (`int`, *optional*, defaults to 1):
             Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
         res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
@@ -258,7 +265,7 @@ class JukeboxPriorConfig(PretrainedConfig):
         timing_dims (`int`, *optional*, defaults to 64):
             _description_
         width (`int`, *optional*, defaults to 2048):
-           Dimension of the attention layers. # TODO this is a bit confusing
+            Dimension of the attention layers. # TODO this is a bit confusing
         zero_out (`bool`, *optional*, defaults to False):
             Whether or not to zero out convolution weights when initialising.
     """
@@ -409,12 +416,12 @@ class JukeboxVQVAEConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
     `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the VQVAE from
-    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
--1b-lyrics) architecture.
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
 
+    Args:
         act_fn (`str`, *optional*, defaults to "relu"):
             _description_
         codebook_dimension (`int`, *optional*, defaults to 2048):
@@ -432,8 +439,8 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
         lmu (`float`, *optional*, defaults to 0.99):
-            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
-            of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
+            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix
+            A.1 of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
         multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
             Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
         res_conv_depth (`int`, *optional*, defaults to 4):
@@ -443,17 +450,17 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
         res_dilation_cycle (`_type_`, *optional*, defaults to None):
-             Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
+                Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have
+                a depth
             of reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
         res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
-             Downsampling rate for each level of the hierachical VQ-VAE.
+                Downsampling rate for each level of the hierachical VQ-VAE.
         res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
             Stride used for each level of the hierachical VQ-VAE.
         sample_length (`int`, *optional*, defaults to 1058304):
-            Provides the max input shape of the VQVAE. Is used to compute the input shape of
-            each level.
+            Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
     """
 
     def __init__(
@@ -479,8 +486,6 @@ def __init__(
         **kwargs
     ):
 
-
-
         self.hop_fraction = hop_fraction
         self.conv_input_shape = conv_input_shape
         self.sample_length = sample_length
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 452250c283e06..16ed8907a35a4 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1910,7 +1910,7 @@ def _get_encoder_config(self, config):
         encoder_config.attention_multiplier = config.encoder_attention_multiplier
         encoder_config.mlp_multiplier = config.encoder_mlp_multiplier
         encoder_config.resid_dropout = config.encoder_resid_dropout
-        encoder_config.res_scale = config.encoder_res_scale
+        encoder_config.attn_res_scale = config.encoder_attn_res_scale
         encoder_config.spread = config.encoder_spread
         encoder_config.width = config.encoder_width
         encoder_config.zero_out = config.encoder_zero_out
@@ -1977,8 +1977,8 @@ def get_music_tokens_conds(self, music_tokens, start, end):
 
     def prior_preprocess(self, tokens, conds):
         """
-        Shifts the input tokens to account for the dictionnary merge. The embed_dim_shift give by how much the
-        music tokens should be shifted by. It is equal to encoder_n_vocab.
+        Shifts the input tokens to account for the dictionnary merge. The embed_dim_shift give by how much the music
+        tokens should be shifted by. It is equal to encoder_n_vocab.
         """
         batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
@@ -2079,8 +2079,8 @@ def sample(
         sample_tokens=None,
     ):
         """
-        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.
-        music_tokens : previously sampled music tokens that are attended to by the prior.
+        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas. music_tokens :
+        previously sampled music tokens that are attended to by the prior.
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 9bfea86810883..782e0234f6358 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -2683,6 +2683,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class JukeboxPrior(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class JukeboxVQVAE(metaclass=DummyObject):
     _backends = ["torch"]
 

From 86ba8f5abcb12db03e7f59e2952ae5f9bb4d6b44 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:44:59 +0000
Subject: [PATCH 163/196] update JukeboxConfig

---
 .../models/jukebox/configuration_jukebox.py   | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index cc4dde150f1fd..f86cb9974abc8 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -540,37 +540,34 @@ class JukeboxConfig(PretrainedConfig):
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
-        sampling_rate (`int`, *optional*, defaults to 44100):
-            Sampling rate of the raw audio.
+        vqvae_config (`JukeboxVQVAEConfig`, *optional*, defaults to None):
+            _description_
+        prior_config_list (`List[`JukeboxPriorConfig`]`, *optional*, defaults to None):
+            _description_
         nb_priors (`int`, *optional*, defaults to 3):
             Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
             (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
             trained using a top prior and 2 upsampler priors.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            Sampling rate of the raw audio.
         timing_dims (`int`, *optional*, defaults to 64):
             Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
             layer. The timing embedding layer converts the absolute and relative position in the currently sampled
             audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
-        metadata_conditioning (`bool`, *optional*, defaults to `True`):
-            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
-            duration.
-        is_encoder_decoder (`List[bool]`, *optional*, defaults to `[True, False, False]`):
-            Whether or not to use a single encoder-decoder architecture or split both modules and have a seperate
-            `encoderoder` for each of the priors.
-        merged_decoder (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not the encoders are merged. This means that the input of.
-        lyric_conditioning (`list`, *optional*, defaults to [True, False, False]):
-            Whether or not to use the lyrics as conditioning.
-        nb_relevant_lyric_tokens (`list`, *optional*, defaults to [384, 0, 0]):
-            Number of tokens that are used when sampling a single window of length `n_ctx`
-        min_duration (`float`, *optional*, defaults to 17.84):
+        min_duration (`int`, *optional*, defaults to 0):
             Minimum duration of the audios to generate
         max_duration (`float`, *optional*, defaults to 600.0):
             Maximum duration of the audios to generate
         max_nb_genres (`int`, *optional*, defaults to 5):
             Maximum number of genres that can be used to condition a single sample.
+        metadata_conditioning (`bool`, *optional*, defaults to True):
+            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
+            duration.
         init_std (`float`, *optional*, defaults to 0.2):
             Standard deviation used to inital the model.
 
+    Example:
+
     ```python
     >>> from transformers import JukeboxModel, JukeboxConfig
 
@@ -645,11 +642,11 @@ def __init__(
     @classmethod
     def from_configs(cls, prior_configs: List[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
         r"""
-        Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
+        Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model
         configuration.
 
         Returns:
-            [`CLIPConfig`]: An instance of a configuration object
+            [`JukeboxConfig`]: An instance of a configuration object
         """
         prior_config_list = [config.to_dict() for config in prior_configs]
         return cls(prior_config_list=prior_config_list, vqvae_config_dict=vqvae_config.to_dict(), **kwargs)

From 43b71b884faca8a015983fd22a440aa3966d3cfe Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:46:04 +0000
Subject: [PATCH 164/196] format

---
 .../models/jukebox/configuration_jukebox.py          | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index f86cb9974abc8..73c584ebb2af3 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -414,8 +414,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 class JukeboxVQVAEConfig(PretrainedConfig):
     """
     This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
-    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the VQVAE from
+    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the VQVAE from
     [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
@@ -439,8 +439,8 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
         lmu (`float`, *optional*, defaults to 0.99):
-            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix
-            A.1 of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
+            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
+            of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
         multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
             Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
         res_conv_depth (`int`, *optional*, defaults to 4):
@@ -450,8 +450,8 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
         res_dilation_cycle (`_type_`, *optional*, defaults to None):
-                Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have
-                a depth
+                Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a
+                depth
             of reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)

From b17f841ec021e282ccb999615ca8628c7a442b74 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:46:51 +0000
Subject: [PATCH 165/196] JukeboxPrior is tested

---
 utils/check_repo.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/check_repo.py b/utils/check_repo.py
index 40ae99786fbe4..3cfc9618baea9 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -51,6 +51,7 @@
     "TimeSeriesTransformerEncoder",  # Building part of bigger (tested) model.
     "TimeSeriesTransformerDecoder",  # Building part of bigger (tested) model.
     "JukeboxVQVAE",  # Building part of bigger (tested) model.
+    "JukeboxPrior",  # Building part of bigger (tested) model.
     "DeformableDetrEncoder",  # Building part of bigger (tested) model.
     "DeformableDetrDecoder",  # Building part of bigger (tested) model.
     "OPTDecoder",  # Building part of bigger (tested) model.

From 1fbb470995eb607036778e78f0d7d0684dc469fb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 17:52:32 +0000
Subject: [PATCH 166/196] document JukeboxPrior, ignore non auto configured

---
 docs/source/en/model_doc/jukebox.mdx | 7 +++++++
 utils/check_repo.py                  | 1 +
 2 files changed, 8 insertions(+)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index a7b7e51ce9a76..ca677ea972d0f 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -48,6 +48,13 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 ## JukeboxModel
 
+[[autodoc]] JukeboxPrior
+    - sample
+    - primed_sample
+    - forward
+
+## JukeboxModel
+
 [[autodoc]] JukeboxModel
     - ancestral_sample
     - primed_sample
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 3cfc9618baea9..6c974467727c8 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -145,6 +145,7 @@
     "EsmForProteinFolding",
     "TimeSeriesTransformerForPrediction",
     "JukeboxVQVAE",
+    "JukeboxPrior",
     "PegasusXEncoder",
     "PegasusXDecoder",
     "PegasusXDecoderWrapper",

From 8f30e8df0b4b11d99067b75f9cf543e57d2c29cf Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 18:00:31 +0000
Subject: [PATCH 167/196] cleaning up

---
 .../models/jukebox/convert_jukebox.py         |  23 +-
 .../models/jukebox/sample_original_jukebox.py | 230 ------------------
 2 files changed, 1 insertion(+), 252 deletions(-)
 delete mode 100644 src/transformers/models/jukebox/sample_original_jukebox.py

diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py
index 1921d628e71dd..c8d0831e53f3d 100644
--- a/src/transformers/models/jukebox/convert_jukebox.py
+++ b/src/transformers/models/jukebox/convert_jukebox.py
@@ -223,28 +223,7 @@ def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None):
 
     model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]]
 
-    # config = JukeboxConfig.from_pretrained("openai/" + model_name)
-    # to convert the 5b lyric token model, use : or "openai/jukebox-5b-lyrics"
-    # config = JukeboxConfig(
-    #     timing_dims=128
-    #     prior_attention_pattern=[10, 2, 2],
-    #     prior_blocks=128,
-    #     prime_n_vocab=80,
-    #     nb_relevant_lyric_tokens=[512, 0, 0],
-    #     prior_n_heads=[8, 1, 1],
-    #     prior_n_ctx=[8192, 8192, 8192],
-    #     prime_width=[1280, 128, 128],
-    #     prior_width=[4800, 1920, 1920],
-    #     is_encoder_decoder=[False, False, False],
-    #     timing_dims=128,
-    #     vqvae_width=64,
-    #     metadata_dims=[(120, 4111), (120, 4111), (120, 4111)],
-    #     min_duration=23.8,
-    #     sample_length= 1058304,
-    #     prior_depth=[79, 72, 72],
-    #     max_nb_genres=1,
-    # )
-    config = JukeboxConfig.from_pretrained("ArthurZ/new-5b-lyrics")
+    config = JukeboxConfig.from_pretrained(model_name)
     model = JukeboxModel(config)
 
     weight_dict = []
diff --git a/src/transformers/models/jukebox/sample_original_jukebox.py b/src/transformers/models/jukebox/sample_original_jukebox.py
deleted file mode 100644
index c2727e188a30a..0000000000000
--- a/src/transformers/models/jukebox/sample_original_jukebox.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The OpenAI Team Authors the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# in order to be used, the following git repo has to be used :
-# git clone --branch adaptive_device https://github.com/ArthurZucker/jukebox.git
-
-import os
-import random
-
-import numpy as np
-import torch
-import torch as t
-
-from jukebox.hparams import HPARAMS_REGISTRY, Hyperparams, setup_hparams
-from jukebox.make_models import MODELS, make_prior, make_vqvae
-from jukebox.sample import _sample
-from jukebox.utils.dist_utils import setup_dist_from_mpi
-from jukebox.utils.torch_utils import empty_cache
-
-
-rank, local_rank, device = setup_dist_from_mpi()
-
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.backends.cudnn.enabled = False
-
-
-def log_zs(zs, level, model, save_dir="logits"):
-    os.makedirs(save_dir, exist_ok=True)
-    with open(f"{save_dir}/{model}_{level}.txt", "w") as file:
-        file.write(str(zs[level][0].cpu()))
-
-
-def get_args(model):
-    sampling_temperature = 0.98
-    lower_batch_size = 16
-    max_batch_size = 1 if model == "5b_lyrics" else 16
-    lower_level_chunk_size = 32
-    chunk_size = 16 if model == "5b_lyrics" else 32
-    sampling_kwargs = [
-        dict(
-            temp=0.99,
-            fp16=False,
-            max_batch_size=lower_batch_size,
-            chunk_size=lower_level_chunk_size,
-            sample_tokens=10,
-        ),
-        dict(
-            temp=0.99,
-            fp16=False,
-            max_batch_size=lower_batch_size,
-            chunk_size=lower_level_chunk_size,
-            sample_tokens=10,
-        ),
-        dict(
-            temp=sampling_temperature,
-            fp16=False,
-            max_batch_size=max_batch_size,
-            chunk_size=chunk_size,
-            sample_tokens=10,
-        ),
-    ]
-    return sampling_kwargs
-
-
-def set_seed(seed):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-
-
-def test_sampling(model, device, tokens=40):
-    hps = Hyperparams()
-    hps.device = device
-    hps.sr = 44100
-    hps.n_samples = 1
-    hps.name = "samples"
-    hps.levels = 3
-    hps.hop_fraction = [0.5, 0.5, 0.125]
-    HPARAMS_REGISTRY[f"prior_{model}"][
-        "min_duration"
-    ] = 0  # set the minium duration of the model to 0 to generate only 40 tokens
-    vqvae, *priors = MODELS[model]
-    vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=264576)), device)  # before : 1048576, 2645888
-    top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)
-    hps.sample_length = tokens * top_prior.raw_to_tokens
-    metas = (
-        [
-            dict(
-                artist="Zac Brown Band",
-                genre="Country",
-                total_length=hps.sample_length,
-                offset=0,
-                lyrics="""I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone Stand in the desert. . . . Near them, on the sand, Half sunk a
-    shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those
-    passions read Which yet survive, stamped on these lifeless things, The hand that mocked them, and the heart that
-    fed; And on the pedestal, these words appear: My name is Ozymandias, King of Kings; Look on my Works, ye Mighty,
-    and despair! Nothing beside remains. Round the decay Of that colossal Wreck, boundless and bare The lone and level
-    sands stretch far away
-    """,
-            ),
-        ]
-        * hps.n_samples
-    )
-
-    labels = [None, None, top_prior.labeller.get_batch_labels(metas, device)]
-    sampling_kwargs = get_args(model)
-    hps.sample_length = tokens * top_prior.raw_to_tokens
-
-    set_seed(0)
-    zs = [t.zeros(hps.n_samples, 0, dtype=t.long, device=device) for _ in range(len(priors))]
-    zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)
-    log_zs(zs, 2, f"{model}-{device}")
-
-    del top_prior
-    empty_cache()
-    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, device) for prior in priors[:-1]]
-    labels[:2] = [prior.labeller.get_batch_labels(metas, device) for prior in upsamplers]
-
-    set_seed(0)
-    zs[-1] = torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).to(device)), dim=-1).long()
-    hps.sample_length = tokens * upsamplers[1].raw_to_tokens
-    zs = _sample(zs, labels, sampling_kwargs, [None, upsamplers[1], None], [1], hps)
-    log_zs(zs, 1, f"{model}-{device}")
-
-    set_seed(0)
-    hps.sample_length = tokens * upsamplers[0].raw_to_tokens
-    zs[-2] = torch.cat((zs[-2], torch.zeros(1, 1000000 - zs[-2].shape[-1]).to(device)), dim=-1).long()
-    zs = _sample(zs, labels, sampling_kwargs, [upsamplers[0], None, None], [0], hps)
-    log_zs(zs, 0, f"{model}-{device}")
-
-    empty_cache()
-    del upsamplers
-
-
-def test_prime_samling(model, device, tokens=40):
-    hps = Hyperparams()
-    hps.device = device
-    hps.sr = 44100
-    hps.n_samples = 1
-    hps.name = "samples"
-    hps.levels = 3
-    hps.hop_fraction = [0.5, 0.5, 0.125]
-    HPARAMS_REGISTRY[f"prior_{model}"]["min_duration"] = 0
-    vqvae, *priors = MODELS[model]
-    vqvae = make_vqvae(setup_hparams(vqvae, dict(sample_length=264576)), device)  # before : 1048576, 2645888
-    top_prior = make_prior(setup_hparams(priors[-1], dict()), vqvae, device)
-    hps.sample_length = tokens * top_prior.raw_to_tokens
-    metas = (
-        [
-            dict(
-                artist="Zac Brown Band",
-                genre="Country",
-                total_length=hps.sample_length,
-                offset=0,
-                lyrics="""I met a traveller from an antique land,
-    Who said "Two vast and trunkless legs of stone Stand in the desert. . . . Near them, on the sand, Half sunk a
-    shattered visage lies, whose frown, And wrinkled lip, and sneer of cold command, Tell that its sculptor well those
-    passions read Which yet survive, stamped on these lifeless things, The hand that mocked them, and the heart that
-    fed; And on the pedestal, these words appear: My name is Ozymandias, King of Kings; Look on my Works, ye Mighty,
-    and despair! Nothing beside remains. Round the decay Of that colossal Wreck, boundless and bare The lone and level
-    sands stretch far away
-    """,
-            ),
-        ]
-        * hps.n_samples
-    )
-    labels = [None, None, top_prior.labeller.get_batch_labels(metas, device)]
-    sampling_kwargs = get_args(model)
-
-    set_seed(0)
-    x = torch.rand((1, 5120, 1)).to(device)
-    vqvae.to(device)
-    zs = [None, None, top_prior.encode(x, start_level=2, bs_chunks=x.shape[0])[0].to(device)]
-    zs = _sample(zs, labels, sampling_kwargs, [None, None, top_prior], [2], hps)
-    log_zs(zs, 2, f"primed-{model}-{device}")
-
-    del top_prior
-    empty_cache()
-
-    upsamplers = [make_prior(setup_hparams(prior, dict()), vqvae, device) for prior in priors[:-1]]
-    labels = [
-        upsamplers[0].labeller.get_batch_labels(metas, device),
-        upsamplers[0].labeller.get_batch_labels(metas, device),
-        None,
-    ]
-
-    set_seed(0)
-    hps.sample_length = tokens * upsamplers[1].raw_to_tokens
-    zs = [
-        None,
-        upsamplers[-1].encode(x, start_level=1, bs_chunks=x.shape[0])[0].to(device),
-        torch.cat((zs[-1], torch.zeros(1, 1000000 - zs[-1].shape[-1]).to(device)), dim=-1).long(),
-    ]
-    zs = _sample(zs, labels, sampling_kwargs, [None, upsamplers[1], None], [1], hps)
-    log_zs(zs, 1, f"primed-{model}-{device}")
-
-    set_seed(0)
-    hps.sample_length = tokens * upsamplers[0].raw_to_tokens
-    zs = [
-        upsamplers[-1].encode(x, start_level=0, bs_chunks=x.shape[0])[0].to(device),
-        torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[1]).to(device)), dim=-1).long(),
-        torch.zeros(1, 1000000).to(device).long(),
-    ]
-    zs = _sample(zs, labels, sampling_kwargs, [upsamplers[0], None, None], [0], hps)
-    log_zs(zs, 0, f"primed-{model}-{device}")
-
-
-test_sampling("1b_lyrics", "cpu")
-test_sampling("1b_lyrics", "cuda")
-test_sampling("5b_lyrics", "cpu", tokens=60)
-test_sampling("5b_lyrics", "cuda", tokens=60)
-
-test_prime_samling("1b_lyrics", "cpu")
-test_prime_samling("1b_lyrics", "cuda")
-test_prime_samling("5b_lyrics", "cpu", tokens=60)
-test_prime_samling("5b_lyrics", "cuda", tokens=60)

From 7ea6fadbd5f4321d5c4ae88ede107bed62f5cfc5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 18:03:18 +0000
Subject: [PATCH 168/196] nit

---
 src/transformers/models/jukebox/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index f4fd73cb9c048..8272d8204450f 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -37,6 +37,7 @@
         "JukeboxModel",
         "JukeboxPreTrainedModel",
         "JukeboxVQVAE",
+        "JukeboxPrior",
     ]
 
 if TYPE_CHECKING:
@@ -53,6 +54,7 @@
             JUKEBOX_PRETRAINED_MODEL_ARCHIVE_LIST,
             JukeboxModel,
             JukeboxPreTrainedModel,
+            JukeboxPrior,
             JukeboxVQVAE,
         )
 

From b4c86496c46c7da611774287e3df6575c4d6c0a0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 18:09:33 +0000
Subject: [PATCH 169/196] fix doc build

---
 docs/source/en/model_doc/jukebox.mdx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index ca677ea972d0f..4774cf7dd4283 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -50,7 +50,6 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 [[autodoc]] JukeboxPrior
     - sample
-    - primed_sample
     - forward
 
 ## JukeboxModel

From 15121b238cf06ee4c6345b4594aa0daef4eb619b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 19:18:05 +0000
Subject: [PATCH 170/196] clean tests

---
 docs/source/en/model_doc/jukebox.mdx          |  2 +-
 tests/models/jukebox/test_modeling_jukebox.py | 60 ++++++++-----------
 2 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 4774cf7dd4283..461307d38d56c 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -29,7 +29,7 @@ The metadata such as *artist, genre and timing* are passed to each prior, in the
 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
 
 Tips:
-- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory.
+- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the hugging face traineer!
 - This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, either use accelerate or refer the the example notbook which should provide a wrapper.
 - Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`.
 - Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 91919ce20aae9..6078022f7be7b 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -156,21 +156,20 @@ def test_sampling(self):
 
         set_seed(0)
         zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        torch.testing.assert_allclose(zs[-2][0], torch.tensor(self.EXPECTED_OUTPUT_1))
+        torch.testing.assert_allclose(zs[1][0], torch.tensor(self.EXPECTED_OUTPUT_1))
 
         set_seed(0)
         zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
         torch.testing.assert_allclose(zs[2][0], torch.tensor(self.EXPECTED_OUTPUT_0))
 
     @slow
-    def test_slow_sampling(self):
+    def test_conditioning(self):
         torch.backends.cuda.matmul.allow_tf32 = False
-
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
 
-        labels = [i.cuda() for i in self.prepare_inputs()]
+        labels = self.prepare_inputs()
         set_seed(0)
-        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
+        zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
 
         top_prior = model.priors[0]
         start = 0
@@ -178,9 +177,9 @@ def test_slow_sampling(self):
         metadata = top_prior.get_metadata(labels[0].clone(), start, 1058304, 0)
 
         self.assertIsNone(music_token_conds)
-        self.assertListEqual(metadata.cpu().numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
+        self.assertListEqual(metadata.numpy()[0][:10].tolist(), self.EXPECTED_Y_COND)
 
-        audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata.cpu())
+        audio_conditioning, metadata_conditioning, lyric_tokens = top_prior.get_cond(music_token_conds, metadata)
         torch.testing.assert_allclose(
             audio_conditioning[0][0][:30].detach(), torch.tensor(self.EXPECTED_AUDIO_COND), atol=1e-4, rtol=1e-4
         )
@@ -190,13 +189,18 @@ def test_slow_sampling(self):
         torch.testing.assert_allclose(
             lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
         )
+    @slow
+    def test_slow_sampling(self):
+        torch.backends.cuda.matmul.allow_tf32 = False
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
+        labels = [i.cuda() for i in self.prepare_inputs()]
 
         set_seed(0)
-        top_prior.cuda()
+        model.priors[0].cuda()
         zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
         zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
         torch.testing.assert_allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
-        top_prior.cpu()
+        model.priors[0].cpu()
 
         set_seed(0)
         model.priors[1].cuda()
@@ -216,35 +220,23 @@ def test_primed_sampling(self):
         model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
         set_seed(0)
         waveform = torch.rand((1, 5120, 1))
-        tokens = [i.cuda() for i in self.prepare_inputs()]
+        tokens = [i for i in self.prepare_inputs()]
 
-        model.priors[0].cuda()
-        zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0].cuda(), None, None]
+        zs = [model.vqvae.encode(waveform, start_level=2, bs_chunks=waveform.shape[0])[0], None, None]
         zs = model._sample(
             zs, tokens, sample_levels=[0], save_results=False, sample_length=40 * model.priors[0].raw_to_tokens
         )
-        torch.testing.assert_allclose(zs[0][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_0))
-        model.priors[0].cpu()
+        torch.testing.assert_allclose(zs[0][0][:40], torch.tensor(self.EXPECTED_PRIMED_0))
 
-        model.priors[1].cuda()
-        upper_2 = torch.cat((zs[0], torch.zeros(1, 1000000 - zs[0].shape[-1]).cuda()), dim=-1).long()
-        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0].cuda(), None]
-        zs = model._sample(
-            zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[-2].raw_to_tokens
-        )
-        torch.testing.assert_allclose(zs[1][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_1))
-        model.priors[1].cpu()
 
-        model.priors[2].cuda()
-        upper_1 = torch.cat((zs[1], torch.zeros(1, 1000000 - zs[1].shape[-1]).cuda()), dim=-1).long()
-        zs = [
-            upper_2,
-            upper_1,
-            model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0].cuda(),
-        ]
-        zs = model._sample(
-            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
-        )
+        upper_2 = torch.cat((zs[0], torch.zeros(1, 2048 - zs[0].shape[-1])), dim=-1).long()
+        zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0], None]
+        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens)
+        torch.testing.assert_allclose(zs[1][0][:40], torch.tensor(self.EXPECTED_PRIMED_1))
+
+        upper_1 = torch.cat((zs[1], torch.zeros(1, 2048 - zs[1].shape[-1])), dim=-1).long()
+        zs = [upper_2,upper_1,model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
+        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens)
         torch.testing.assert_allclose(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow
@@ -291,7 +283,7 @@ class Jukebox5bModelTester(unittest.TestCase):
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
+        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1237
     ]
 
     EXPECTED_OUTPUT_1 = [
@@ -382,7 +374,7 @@ def test_slow_sampling(self):
 
     @slow
     def test_fp16_slow_sampling(self):
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval().half().to("cuda").half()
+        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval().half().to("cuda")
         labels = [i.cuda() for i in self.prepare_inputs(self.model_id)]
 
         set_seed(0)

From c5f12cd76750a0b8b5617a2a0309d4872ddc88c2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 19:19:37 +0000
Subject: [PATCH 171/196] pretty TQDM : leave = False everywhere

---
 src/transformers/models/jukebox/modeling_jukebox.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 16ed8907a35a4..c1f08f7c1d874 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1474,7 +1474,7 @@ def sample(
             if get_preds:
                 preds = []
 
-            iter = tqdm(range(0, sample_tokens))
+            iter = tqdm(range(0, sample_tokens), leave = False)
             for sample_t in iter:
                 iter.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
@@ -1589,7 +1589,7 @@ def primed_sample(
             # the input of the encoder and decoder can be merged into (lyrics, music tokens)
             input_tokens = sampled_audio[-1]
 
-            iter = tqdm(range(len(sampled_audio), sample_tokens))
+            iter = tqdm(range(len(sampled_audio), sample_tokens), leave = False)
             for sample_t in iter:
                 iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
@@ -2395,7 +2395,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
         metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
         tokens = []
-        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list))
+        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list), leave = False)
         for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
             iterator.set_description(f"Sampling windows of {sample_tokens}")
             tokens_i = prior.sample(
@@ -2416,7 +2416,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
     # Sample total_length tokens at level=level with hop_length=hop_length
     def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
         if total_length >= self.priors[level].n_ctx:
-            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length))
+            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), leave = False)
             for start in get_starts(total_length, self.priors[level].n_ctx, hop_length):
                 iterator.set_description(
                     f"[prior level {level}] Sampling {self.priors[level].n_ctx}/{total_length} tokens", refresh=True

From 52209bf82a8e6635c3c2554908cc0d33a3a3acc7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 7 Nov 2022 19:20:09 +0000
Subject: [PATCH 172/196] style

---
 src/transformers/models/jukebox/modeling_jukebox.py |  8 ++++----
 tests/models/jukebox/test_modeling_jukebox.py       | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index c1f08f7c1d874..2e7ac88162b38 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1474,7 +1474,7 @@ def sample(
             if get_preds:
                 preds = []
 
-            iter = tqdm(range(0, sample_tokens), leave = False)
+            iter = tqdm(range(0, sample_tokens), leave=False)
             for sample_t in iter:
                 iter.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
@@ -1589,7 +1589,7 @@ def primed_sample(
             # the input of the encoder and decoder can be merged into (lyrics, music tokens)
             input_tokens = sampled_audio[-1]
 
-            iter = tqdm(range(len(sampled_audio), sample_tokens), leave = False)
+            iter = tqdm(range(len(sampled_audio), sample_tokens), leave=False)
             for sample_t in iter:
                 iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
                 hidden_states, cond = self.get_emb(
@@ -2395,7 +2395,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
         metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
         tokens = []
-        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list), leave = False)
+        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list), leave=False)
         for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
             iterator.set_description(f"Sampling windows of {sample_tokens}")
             tokens_i = prior.sample(
@@ -2416,7 +2416,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
     # Sample total_length tokens at level=level with hop_length=hop_length
     def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
         if total_length >= self.priors[level].n_ctx:
-            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), leave = False)
+            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), leave=False)
             for start in get_starts(total_length, self.priors[level].n_ctx, hop_length):
                 iterator.set_description(
                     f"[prior level {level}] Sampling {self.priors[level].n_ctx}/{total_length} tokens", refresh=True
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 6078022f7be7b..3521a0efe0fb2 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -189,6 +189,7 @@ def test_conditioning(self):
         torch.testing.assert_allclose(
             lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
         )
+
     @slow
     def test_slow_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
@@ -228,15 +229,18 @@ def test_primed_sampling(self):
         )
         torch.testing.assert_allclose(zs[0][0][:40], torch.tensor(self.EXPECTED_PRIMED_0))
 
-
         upper_2 = torch.cat((zs[0], torch.zeros(1, 2048 - zs[0].shape[-1])), dim=-1).long()
         zs = [upper_2, model.vqvae.encode(waveform, start_level=1, bs_chunks=waveform.shape[0])[0], None]
-        zs = model._sample(zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens)
+        zs = model._sample(
+            zs, tokens, sample_levels=[1], save_results=False, sample_length=40 * model.priors[1].raw_to_tokens
+        )
         torch.testing.assert_allclose(zs[1][0][:40], torch.tensor(self.EXPECTED_PRIMED_1))
 
         upper_1 = torch.cat((zs[1], torch.zeros(1, 2048 - zs[1].shape[-1])), dim=-1).long()
-        zs = [upper_2,upper_1,model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
-        zs = model._sample(zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens)
+        zs = [upper_2, upper_1, model.vqvae.encode(waveform, start_level=0, bs_chunks=waveform.shape[0])[0]]
+        zs = model._sample(
+            zs, tokens, sample_levels=[2], save_results=False, sample_length=40 * model.priors[2].raw_to_tokens
+        )
         torch.testing.assert_allclose(zs[2][0][:40].cpu(), torch.tensor(self.EXPECTED_PRIMED_2))
 
     @slow

From 897b21738e031ebdcf7a6e336f6510a8f1617f55 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 8 Nov 2022 13:26:46 +0000
Subject: [PATCH 173/196] update tests

---
 tests/models/jukebox/test_modeling_jukebox.py | 47 +------------------
 1 file changed, 2 insertions(+), 45 deletions(-)

diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 3521a0efe0fb2..e0d77e642a578 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,7 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "/home/arthur_huggingface_co/transformers/jukebox-1b-lyrics-converted"
+    model_id = "openai/jukebox-1b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -72,24 +72,6 @@ class Jukebox1bModelTester(unittest.TestCase):
 
     EXPECTED_Y_COND = [1058304, 0, 786432, 7169, 507, 76, 27, 40, 30, 76]
 
-    EXPECTED_GPU_OUTPUTS_0 = [
-        591, 1979, 89, 1332, 1572, 755, 844, 1022, 234, 1174, 1962, 1174,
-        1755, 676, 58, 1756, 844, 739, 185, 1332, 806, 1180, 774, 842,
-        306, 442, 1797, 734, 1081, 109, 806, 1492, 926, 2008, 844, 2008,
-        992, 89, 1353, 637
-    ]
-    EXPECTED_GPU_OUTPUTS_1 = [
-        1125, 2037, 317, 1372, 2037, 851, 1274, 1125, 642, 502, 1274, 851,
-        1125, 502, 317, 1125, 880, 904, 317, 1125, 642, 502, 844, 851,
-        416, 317, 1585, 642, 1125, 58, 697, 1125, 1585, 2037, 502, 2037,
-        851, 317, 1125, 642
-    ]
-    EXPECTED_GPU_OUTPUTS_2 = [
-        1489, 1489, 324, 1489, 1600, 1150, 1489, 1489, 947, 1357, 1600, 1417,
-        1481, 1003, 141, 1165, 1303, 904, 303, 1369, 395, 461, 994, 1283,
-        269, 35, 1699, 241, 1369, 35, 1303, 583, 825, 1941, 1089, 1944,
-        581, 35, 1153, 1153
-    ]
     EXPECTED_PRIMED_0 = [
         390, 1160, 1002, 1907, 1788, 1788, 1788, 1907, 1002, 1002, 1854, 1002,
         1002, 1002, 1002, 1002, 1002, 1160, 1160, 1606, 596, 596, 1160, 1002,
@@ -190,30 +172,6 @@ def test_conditioning(self):
             lyric_tokens[0, :30].detach(), torch.tensor(self.EXPECTED_LYRIC_COND), atol=1e-4, rtol=1e-4
         )
 
-    @slow
-    def test_slow_sampling(self):
-        torch.backends.cuda.matmul.allow_tf32 = False
-        model = JukeboxModel.from_pretrained(self.model_id, min_duration=0).eval()
-        labels = [i.cuda() for i in self.prepare_inputs()]
-
-        set_seed(0)
-        model.priors[0].cuda()
-        zs = [torch.zeros(1, 0, dtype=torch.long).cuda() for _ in range(3)]
-        zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
-        torch.testing.assert_allclose(zs[0][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_2))
-        model.priors[0].cpu()
-
-        set_seed(0)
-        model.priors[1].cuda()
-        zs = model._sample(zs, labels, [1], sample_length=40 * model.priors[1].raw_to_tokens, save_results=False)
-        torch.testing.assert_allclose(zs[1][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_1))
-        model.priors[1].cpu()
-
-        set_seed(0)
-        model.priors[2].cuda()
-        zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[2].raw_to_tokens, save_results=False)
-        torch.testing.assert_allclose(zs[2][0].cpu(), torch.tensor(self.EXPECTED_GPU_OUTPUTS_0))
-
     @slow
     def test_primed_sampling(self):
         torch.backends.cuda.matmul.allow_tf32 = False
@@ -260,7 +218,7 @@ def test_vqvae(self):
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "/home/arthur_huggingface_co/transformers/jukebox-5b-lyrics-converted"
+    model_id = "openai/jukebox-5b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -327,7 +285,6 @@ class Jukebox5bModelTester(unittest.TestCase):
         307, 89, 1353, 616, 34, 842, 185, 842, 34, 842, 185, 842,
         307, 114, 185, 89, 34, 1268, 185, 89, 34, 842, 185, 89
     ]
-
     # fmt: on
 
     def prepare_inputs(self, model_id):

From af876a7a2c1618ac6e10142b68ba51f2021073c7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 8 Nov 2022 13:37:23 +0000
Subject: [PATCH 174/196] add missing documentation

---
 .../models/jukebox/modeling_jukebox.py        | 41 +++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2e7ac88162b38..cc7380d735df5 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2079,8 +2079,29 @@ def sample(
         sample_tokens=None,
     ):
         """
-        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas. music_tokens :
-        previously sampled music tokens that are attended to by the prior.
+        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.
+
+        Args:
+            n_samples (`int`):
+                Number of samples to generate.
+            music_tokens (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+                Previously gemerated tokens at the current level. Used as context for the generation.
+            music_tokens_conds (`List[`torch.FloatTensor`]`, *optional*, defaults to None):
+                Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
+                conditionned on the upper-level tokens.
+            metadata (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+                List containing the metatdata tensor with the artist, genre and the lyric tokens.
+            temp (`float`, *optional*, defaults to 1.0):
+                Sampling temperature.
+            top_k (`int`, *optional*, defaults to 0):
+                Top k probabilities used for filtering.
+            top_p (`float`, *optional*, defaults to 0.0):
+                Top p probabilities used for filtering.
+            chunk_size (`int`, *optional*, defaults to None):
+                Size of the chunks used to prepare the cache of the transformer.
+            sample_tokens (`int`, *optional*, defaults to None):
+                Number of tokens to sample.
+
         """
         no_past_context = music_tokens is None or music_tokens.shape[1] == 0
         name = {True: "Ancestral", False: "Primed"}[no_past_context]
@@ -2219,6 +2240,20 @@ def forward_tokens(
             return loss, metrics
 
     def forward(self, hidden_states, metadata=None, decode=False, get_preds=False):
+        """
+        Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
+        function. The loss is the sum of the `encoder` loss and the `decoder` loss.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Hidden states which should be raw audio
+            metadata (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+                List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
+            decode (`bool`, *optional*, defaults to False):
+                Whether or not to decode the encoded to tokens.
+            get_preds (`bool`, *optional*, defaults to False):
+                Whether or not to return the actual predicitons of the model.
+        """
         batch_size = hidden_states.shape[0]
         music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
         loss, metrics = self.forward_tokens(
@@ -2290,7 +2325,7 @@ def __init__(self, *inputs, **kwargs):
 @add_start_docstrings(
     """The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
 `continue_sample` and `ancestral_sample`.
-    It does not have a `forward` method as the training is not end to end. If you want to fine tune the model, it is
+    It does not have a `forward` method as the training is not end to end. If you want to fine-tune the model, it is
     recommended to use the `JukeboxPrior` class and train each prior individually.
     """,
     JUKEBOX_START_DOCSTRING,

From 0280041219938db81c158986c0a32ae439618cbc Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 9 Nov 2022 10:14:47 +0100
Subject: [PATCH 175/196] Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/en/model_doc/jukebox.mdx          |  2 +-
 .../models/jukebox/configuration_jukebox.py   | 58 +++++++++----------
 .../models/jukebox/modeling_jukebox.py        | 48 ++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  4 +-
 4 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 461307d38d56c..deef48685397c 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -46,7 +46,7 @@ The original code can be found [here](https://github.com/openai/jukebox).
 [[autodoc]] JukeboxTokenizer
     - save_vocabulary
 
-## JukeboxModel
+## JukeboxPrior
 
 [[autodoc]] JukeboxPrior
     - sample
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 73c584ebb2af3..8ad1a9b22200b 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -142,7 +142,7 @@ class JukeboxPriorConfig(PretrainedConfig):
 
 
     Args:
-        act_fn (`str`, *optional*, defaults to "quick_gelu"):
+        act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
             Activation function.
         alignment_head (`int`, *optional*, defaults to 2):
             Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
@@ -153,16 +153,16 @@ class JukeboxPriorConfig(PretrainedConfig):
         attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
-        attention_pattern (`str`, *optional*, defaults to "enc_dec_with_lyrics"):
+        attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
             Which attention pattern to use for the decoder/
         attn_dropout (`int`, *optional*, defaults to 0):
             Dropout probability for the post-attention layer dropout in the decoder.
-        attn_res_scale (`bool`, *optional*, defaults to False):
+        attn_res_scale (`bool`, *optional*, defaults to `False`):
             Wheter or not to scale the residuals in the attention conditionner block.
         blocks (`int`, *optional*, defaults to 64):
-            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
-            blocks] in the `JukeboxAttention` layer.
-        conv_res_scale (`int`, *optional*, defaults to None):
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
+            blocks]` in the `JukeboxAttention` layer.
+        conv_res_scale (`int`, *optional*):
             Wheter or not to scale the residuals in the conditionner block. Since the top level prior doeas not have a
             conditionner, the default value is to None and should not be modified.
         depth (`int`, *optional*, defaults to 72):
@@ -174,15 +174,15 @@ class JukeboxPriorConfig(PretrainedConfig):
         encoder_attention_multiplier (`float`, *optional*, defaults to 0.25):
             Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
             0.25*width of the model will be used.
-        encoder_attention_pattern (`str`, *optional*, defaults to "RawColumnPreviousRowAttention"):
+        encoder_attention_pattern (`str`, *optional*, defaults to `"RawColumnPreviousRowAttention"`):
             Which attention pattern to use for the lyric encoder.
         encoder_attn_dropout (`float`, *optional*, defaults to 0.0):
             Dropout probability for the post-attention layer dropout in the lyric encoder.
-        encoder_attn_res_scale (`bool`, *optional*, defaults to False):
+        encoder_attn_res_scale (`bool`, *optional*, defaults to `False`):
             Wheter or not to scale the residuals in the attention conditionner block.
         encoder_blocks (`int`, *optional*, defaults to 32):
-            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
-            blocks] in the `JukeboxAttention` layer.
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
+            blocks]` in the `JukeboxAttention` layer.
         encoder_depth (`int`, *optional*, defaults to 18):
             Depth of the encoder model.
         encoder_emb_dropout (`float`, *optional*, defaults to 0.0):
@@ -191,7 +191,7 @@ class JukeboxPriorConfig(PretrainedConfig):
             Number of heads in the lyric encoder
         encoder_init_scale (`float`, *optional*, defaults to 0.1):
             Initialisation scales for the lyric encoder modules.
-        encoder_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
+        encoder_loss_fraction (`list`, *optional*, defaults to `[0.4, 0.0, 0.0]`):
             Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
         encoder_mlp_multiplier (`float`, *optional*, defaults to 1.0):
             Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
@@ -201,7 +201,7 @@ class JukeboxPriorConfig(PretrainedConfig):
             `encoder`.
         encoder_resid_dropout (`float`, *optional*, defaults to 0.0):
             Residual dropout used in the attention pattern of the lyric encoder.
-        encoder_spread (`int`, *optional*, defaults to None):
+        encoder_spread (`int`, *optional*):
             Spread used in the `summary_spread_attention` pattern
         encoder_width (`int`, *optional*, defaults to 128):
             Width of the lyric encoder if `is_encoder_decoder=False` and `nb_relevant_lyric_tokens>0`
@@ -209,21 +209,21 @@ class JukeboxPriorConfig(PretrainedConfig):
             Whether or not to set to zeros the weights the convolutions in the lyric encoder.
         init_scale (`float`, *optional*, defaults to 0.2):
             Initialisation scales for the prior modules.
-        is_encoder_decoder (`bool`, *optional*, defaults to True):
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
             Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
             greater than 0, the `encoder` args should be specified for the lyric encoding.
-        mask (`bool`, *optional*, defaults to False):
+        mask (`bool`, *optional*, defaults to `False`):
             Whether or not to mask the previous positions in the attention.
         max_duration (`int`, *optional*, defaults to 600):
             _description_
         max_nb_genres (`int`, *optional*, defaults to 1):
             _description_
-        merged_decoder (`bool`, *optional*, defaults to True):
+        merged_decoder (`bool`, *optional*, defaults to `True`):
             Whether or not the decoder and the encoder inputs are merged. This is used for the seperated
             encoder-decoder architecture
         metadata_conditioning (`bool`, *optional*, defaults to True):
             _description_
-        metadata_dims (`tuple(int)`, *optional*, defaults to (604, 7898)):
+        metadata_dims (`tuple(int)`, *optional*, defaults to `(604, 7898)`):
             Number of genres and the number of artists that were used to train the embedding layers of the prior
             models.
         min_duration (`int`, *optional*, defaults to 0):
@@ -252,9 +252,9 @@ class JukeboxPriorConfig(PretrainedConfig):
             tokens.
         res_dilation_growth_rate (`int`, *optional*, defaults to 1):
             Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
-        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
+        res_downs_t (`tuple(int)`, *optional*, defaults to `(3, 2, 2)`):
             Downsampling rates used in the audio conditioning network
-        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
+        res_strides_t (`tuple(int)`, *optional*, defaults to `(2, 2, 2)`):
             Striding used in the audio conditioning network
         resid_dropout (`int`, *optional*, defaults to 0):
             Residual dropout used in the attention pattern.
@@ -266,7 +266,7 @@ class JukeboxPriorConfig(PretrainedConfig):
             _description_
         width (`int`, *optional*, defaults to 2048):
             Dimension of the attention layers. # TODO this is a bit confusing
-        zero_out (`bool`, *optional*, defaults to False):
+        zero_out (`bool`, *optional*, defaults to `False`):
             Whether or not to zero out convolution weights when initialising.
     """
 
@@ -422,7 +422,7 @@ class JukeboxVQVAEConfig(PretrainedConfig):
     documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        act_fn (`str`, *optional*, defaults to "relu"):
+        act_fn (`str`, *optional*, defaults to `"relu"`):
             _description_
         codebook_dimension (`int`, *optional*, defaults to 2048):
             Number of codes to use in each of the VQVAE.
@@ -430,18 +430,18 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Commit loss multiplier.
         conv_input_shape (`int`, *optional*, defaults to 1):
             Number of audio channels.
-        conv_res_scale (`bool`, *optional*, defaults to False):
+        conv_res_scale (`bool`, *optional*, defaults to `False`):
             _description_
         embed_dim (`int`, *optional*, defaults to 64):
             Embeding dimension of the codebook vectors.
-        hop_fraction (`List[`int`]`, *optional*, defaults to [0.125, 0.5, 0.5]):
+        hop_fraction (`List[`int`]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
             Fraction of non-intersecting window used when continuing the sampling process.
         levels (`int`, *optional*, defaults to 3):
             Number of hierachical levels that used in the VQVAE.
         lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
-        multipliers (`tuple`, *optional*, defaults to (2, 1, 1)):
+        multipliers (`tuple`, *optional*, defaults to `(2, 1, 1)`):
             Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
         res_conv_depth (`int`, *optional*, defaults to 4):
             Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
@@ -449,15 +449,15 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
-        res_dilation_cycle (`_type_`, *optional*, defaults to None):
+        res_dilation_cycle (`_type_`, *optional*):
                 Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a
                 depth
             of reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
-        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
+        res_downs_t (`tuple(int)`, *optional*, defaults to `(3, 2, 2)`):
                 Downsampling rate for each level of the hierachical VQ-VAE.
-        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
+        res_strides_t (`tuple(int)`, *optional*, defaults to `(2, 2, 2)`):
             Stride used for each level of the hierachical VQ-VAE.
         sample_length (`int`, *optional*, defaults to 1058304):
             Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
@@ -540,9 +540,9 @@ class JukeboxConfig(PretrainedConfig):
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
-        vqvae_config (`JukeboxVQVAEConfig`, *optional*, defaults to None):
+        vqvae_config (`JukeboxVQVAEConfig`, *optional*):
             _description_
-        prior_config_list (`List[`JukeboxPriorConfig`]`, *optional*, defaults to None):
+        prior_config_list (`List[`JukeboxPriorConfig`]`, *optional*):
             _description_
         nb_priors (`int`, *optional*, defaults to 3):
             Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
@@ -560,7 +560,7 @@ class JukeboxConfig(PretrainedConfig):
             Maximum duration of the audios to generate
         max_nb_genres (`int`, *optional*, defaults to 5):
             Maximum number of genres that can be used to condition a single sample.
-        metadata_conditioning (`bool`, *optional*, defaults to True):
+        metadata_conditioning (`bool`, *optional*, defaults to `True`):
             Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
             duration.
         init_std (`float`, *optional*, defaults to 0.2):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index cc7380d735df5..59dc4f7d0fff1 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -471,7 +471,7 @@ def quantise(self, latent_states):
             torch.sum(latent_states**2, dim=-1, keepdim=True)
             - 2 * torch.matmul(latent_states, codebook_weights)
             + torch.sum(codebook_weights**2, dim=0, keepdim=True)
-        )  # (batch_size * latent_states , codebook_weights) # better help from this comment
+        )  # (batch_size * latent_states , codebook_weights)
         min_distance, music_tokens = torch.min(distance, dim=-1)
         fit = torch.mean(min_distance)
         return music_tokens, fit
@@ -727,12 +727,10 @@ def forward(self, raw_audio):
         The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
         computed.
 
-
         Args:
             raw_audio (`torch.FloatTensor`):
                 Audio input which will be encoded and decoded.
 
-
         Returns:
             `Tuple[torch.Tensor, torch.Tensoor`
 
@@ -780,7 +778,7 @@ def __init__(self, config):
         self.c_fc = JukeboxConv1D(embed_dim, hidden_dim)
         self.c_proj = JukeboxConv1D(hidden_dim, embed_dim)
         self.act = ACT2FN[config.act_fn]
-        self.dropout = nn.Dropout(config.resid_dropout) if config.resid_dropout > 0.0 else lambda x: x
+        self.dropout = nn.Dropout(config.resid_dropout)
 
     def forward(self, hidden_states):
         hidden_states = self.c_fc(hidden_states)
@@ -800,7 +798,7 @@ def forward(self, input):
         if input.numel() > self.max_numel:
             return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
         else:
-            return super(JukeboxLayerNorm, self).forward(input).type_as(input)
+            return super().forward(input).type_as(input)
 
 
 class JukeboxAttention(nn.Module):
@@ -2084,12 +2082,12 @@ def sample(
         Args:
             n_samples (`int`):
                 Number of samples to generate.
-            music_tokens (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+            music_tokens (`List[`torch.LongTensor`]`, *optional*):
                 Previously gemerated tokens at the current level. Used as context for the generation.
-            music_tokens_conds (`List[`torch.FloatTensor`]`, *optional*, defaults to None):
+            music_tokens_conds (`List[`torch.FloatTensor`]`, *optional*):
                 Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
                 conditionned on the upper-level tokens.
-            metadata (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+            metadata (`List[`torch.LongTensor`]`, *optional*):
                 List containing the metatdata tensor with the artist, genre and the lyric tokens.
             temp (`float`, *optional*, defaults to 1.0):
                 Sampling temperature.
@@ -2097,9 +2095,9 @@ def sample(
                 Top k probabilities used for filtering.
             top_p (`float`, *optional*, defaults to 0.0):
                 Top p probabilities used for filtering.
-            chunk_size (`int`, *optional*, defaults to None):
+            chunk_size (`int`, *optional*):
                 Size of the chunks used to prepare the cache of the transformer.
-            sample_tokens (`int`, *optional*, defaults to None):
+            sample_tokens (`int`, *optional*):
                 Number of tokens to sample.
 
         """
@@ -2247,11 +2245,11 @@ def forward(self, hidden_states, metadata=None, decode=False, get_preds=False):
         Args:
             hidden_states (`torch.Tensor`):
                 Hidden states which should be raw audio
-            metadata (`List[`torch.LongTensor`]`, *optional*, defaults to None):
+            metadata (`List[`torch.LongTensor`]`, *optional*):
                 List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
-            decode (`bool`, *optional*, defaults to False):
+            decode (`bool`, *optional*, defaults to `False`):
                 Whether or not to decode the encoded to tokens.
-            get_preds (`bool`, *optional*, defaults to False):
+            get_preds (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the actual predicitons of the model.
         """
         batch_size = hidden_states.shape[0]
@@ -2338,9 +2336,7 @@ def __init__(self, config):
         vqvae_config = config.vqvae_config
         self.vqvae = JukeboxVQVAE(vqvae_config)
         self.set_shared_params(config)
-        self.priors = nn.ModuleList()
-        for level in range(config.nb_priors):
-            self.priors.append(JukeboxPrior(config.prior_configs[level], level))
+        self.priors = nn.ModuleList([JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)])
 
     def set_shared_params(self, model_config):
         """
@@ -2396,13 +2392,8 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # get music_tokens already sampled at current level
         previous_sampled_tokens = music_tokens[level][:, start:end]
 
+        sample_tokens = sampling_kwargs.get("sample_tokens", None)
         if "sample_tokens" in sampling_kwargs:
-            # Support sampling a window shorter than n_ctx
-            sample_tokens = sampling_kwargs["sample_tokens"]
-            if sample_tokens is None:
-                sample_tokens = end - start
-
-        else:
             sample_tokens = end - start
 
         conditioning_tokens = previous_sampled_tokens.shape[1]
@@ -2490,14 +2481,13 @@ def _sample(
            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
-            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 +
-            self.config.max_nb_genre + lyric_sequence_lenght)` :
+            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + lyric_sequence_lenght)` :
                 List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                 condition the generation.
             sample_levels (`List[int]`):
                 List of the desired levels at which the sampling will be done. A level is equivalent to the index of
                 the prior in the list of priors
-            metas (`List[Any]`, *optional*, defaults to None):
+            metas (`List[Any]`, *optional*):
                 Metadatas used to generate the `labels`
             chunk_size (`int`, *optional*, defaults to 32):
                 Size of a chunk of audio, used to fill up the memory in chuncks to prevent OOM erros. Bigger chunks
@@ -2510,18 +2500,18 @@ def _sample(
                 Maximum batch size for the top level priors
             sample_length_in_seconds (`int`, *optional*, defaults to 24):
                 Desired lenght of the generation in seconds
-            compute_alignments (`bool`, *optional*, defaults to False):
+            compute_alignments (`bool`, *optional*, defaults to `False`):
                 Whether or not to compute the alignment between the lyrics and the audio using the top_prior
-            sample_tokens (`int`, *optional*, defaults to None):
+            sample_tokens (`int`, *optional*):
                 Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
                 experiments
             offset (`int`, *optional*, defaults to 0):
                 Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
                 greater than 0, the lyrics will be shifted take that intoaccount
-            save_results (`bool`, *optional*, defaults to True):
+            save_results (`bool`, *optional*, defaults to `True`):
                 Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
                 time.
-            sample_length (`int`, *optional*, defaults to None):
+            sample_length (`int`, *optional*):
                 Desired lenght of the generation in samples.
 
         Returns: torch.Tensor
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 0091c89d80947..15b03365f9eac 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -290,7 +290,7 @@ def convert_to_tensors(
         Args:
             tensor_type (`str` or [`~utils.TensorType`], *optional*):
                 The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
-                `None`, no modification is done.
+                unset, no modification is done.
             prepend_batch_axis (`int`, *optional*, defaults to `False`):
                 Whether or not to add the batch dimension during the conversion.
         """
@@ -350,7 +350,7 @@ def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncod
                 Name of the artist.
             genres (`str`):
                 List of genres that will be mixed to condition the audio
-            lyrics (`srt`, Optional):
+            lyrics (`str`, *optional*):
                 Lyrics used to condition the generation
         """
         input_ids = [0, 0, 0]

From 9d4baabc30b7af8c7f5121fdb23cd9e774a7485e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 10:53:33 +0000
Subject: [PATCH 176/196] update doc and variable names, add encoder config in
 config

---
 docs/source/en/model_doc/jukebox.mdx          |   6 +-
 .../models/jukebox/configuration_jukebox.py   | 172 ++++-----
 .../models/jukebox/modeling_jukebox.py        | 343 ++++++++----------
 3 files changed, 215 insertions(+), 306 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 461307d38d56c..caa20d4551be0 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -22,15 +22,15 @@ The abstract from the paper is the following:
 
 *We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
 
-As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. 
+As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length.
 First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditionner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. 
-The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. 
+The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positionnal embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
 
 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
 
 Tips:
 - This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the hugging face traineer!
-- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, either use accelerate or refer the the example notbook which should provide a wrapper.
+- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, use `accelerate`.
 - Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`.
 - Primed sampling (conditionning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
 
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 73c584ebb2af3..9cd76775c7eb2 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -158,79 +158,49 @@ class JukeboxPriorConfig(PretrainedConfig):
         attn_dropout (`int`, *optional*, defaults to 0):
             Dropout probability for the post-attention layer dropout in the decoder.
         attn_res_scale (`bool`, *optional*, defaults to False):
-            Wheter or not to scale the residuals in the attention conditionner block.
+            Whether or not to scale the residuals in the attention conditioner block.
         blocks (`int`, *optional*, defaults to 64):
             Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
             blocks] in the `JukeboxAttention` layer.
         conv_res_scale (`int`, *optional*, defaults to None):
-            Wheter or not to scale the residuals in the conditionner block. Since the top level prior doeas not have a
-            conditionner, the default value is to None and should not be modified.
+            Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
+            conditioner, the default value is to None and should not be modified.
         depth (`int`, *optional*, defaults to 72):
             Number of layers of the decoder architecture. #TODO replace with num decoder_layers?
         emb_dropout (`int`, *optional*, defaults to 0):
             Embedding dropout used in the lyric decoder.
-        embed_dim (`int`, *optional*, defaults to 2048):
-            Dimension of the audio embedings. I can be different with the `width` for smaller models.
-        encoder_attention_multiplier (`float`, *optional*, defaults to 0.25):
-            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
-            0.25*width of the model will be used.
-        encoder_attention_pattern (`str`, *optional*, defaults to "RawColumnPreviousRowAttention"):
-            Which attention pattern to use for the lyric encoder.
-        encoder_attn_dropout (`float`, *optional*, defaults to 0.0):
-            Dropout probability for the post-attention layer dropout in the lyric encoder.
-        encoder_attn_res_scale (`bool`, *optional*, defaults to False):
-            Wheter or not to scale the residuals in the attention conditionner block.
-        encoder_blocks (`int`, *optional*, defaults to 32):
-            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as [blocks, seq_len //
-            blocks] in the `JukeboxAttention` layer.
-        encoder_depth (`int`, *optional*, defaults to 18):
-            Depth of the encoder model.
-        encoder_emb_dropout (`float`, *optional*, defaults to 0.0):
-            Embedding dropout used in the lyric encoder.
-        encoder_heads (`int`, *optional*, defaults to 4):
-            Number of heads in the lyric encoder
-        encoder_init_scale (`float`, *optional*, defaults to 0.1):
-            Initialisation scales for the lyric encoder modules.
-        encoder_loss_fraction (`list`, *optional*, defaults to [0.4, 0.0, 0.0]):
-            Multiplication factor used in front of the lyric encoder loss. Each value is for a particular level.
-        encoder_mlp_multiplier (`float`, *optional*, defaults to 1.0):
-            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
-            the model will be used.
-        encoder_n_vocab (`int`, *optional*, defaults to 79):
-            Defines the number of different lyric tokens that can be represented by the `inputs_ids` passed to the
-            `encoder`.
-        encoder_resid_dropout (`float`, *optional*, defaults to 0.0):
-            Residual dropout used in the attention pattern of the lyric encoder.
-        encoder_spread (`int`, *optional*, defaults to None):
-            Spread used in the `summary_spread_attention` pattern
-        encoder_width (`int`, *optional*, defaults to 128):
-            Width of the lyric encoder if `is_encoder_decoder=False` and `nb_relevant_lyric_tokens>0`
-        encoder_zero_out (`bool`, *optional*, defaults to False):
-            Whether or not to set to zeros the weights the convolutions in the lyric encoder.
+        encoder_config (`JukeboxPriorConfig`, *optional*) :
+            Configuration of the encoder which models the prior on the lyrics.
+        encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
+            Multiplication factor used in front of the lyric encoder loss.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Hidden dimension of the attention layers.
         init_scale (`float`, *optional*, defaults to 0.2):
-            Initialisation scales for the prior modules.
+            Initialization scales for the prior modules.
         is_encoder_decoder (`bool`, *optional*, defaults to True):
             Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
             greater than 0, the `encoder` args should be specified for the lyric encoding.
         mask (`bool`, *optional*, defaults to False):
             Whether or not to mask the previous positions in the attention.
         max_duration (`int`, *optional*, defaults to 600):
-            _description_
+            #TODO FILLME
         max_nb_genres (`int`, *optional*, defaults to 1):
-            _description_
+            #TODO FILLME
         merged_decoder (`bool`, *optional*, defaults to True):
-            Whether or not the decoder and the encoder inputs are merged. This is used for the seperated
+            Whether or not the decoder and the encoder inputs are merged. This is used for the separated
             encoder-decoder architecture
         metadata_conditioning (`bool`, *optional*, defaults to True):
-            _description_
-        metadata_dims (`tuple(int)`, *optional*, defaults to (604, 7898)):
+            #TODO FILLME
+        metadata_dims (`List[`int`]`, *optional*, defaults to [604, 7898]):
             Number of genres and the number of artists that were used to train the embedding layers of the prior
             models.
         min_duration (`int`, *optional*, defaults to 0):
-            _description_
+            #TODO FILLME
         mlp_multiplier (`float`, *optional*, defaults to 1.0):
             Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
             the model will be used.
+        music_vocab_size (`int`, *optional*, defaults to 2048):
+            Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
         n_ctx (`int`, *optional*, defaults to 6144):
             Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
             generating music tokens.
@@ -252,9 +222,9 @@ class JukeboxPriorConfig(PretrainedConfig):
             tokens.
         res_dilation_growth_rate (`int`, *optional*, defaults to 1):
             Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
-        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
+        res_downs_t (`List[`int`]`, *optional*, defaults to [3, 2, 2]):
             Downsampling rates used in the audio conditioning network
-        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
+        res_strides_t (`List[`int`]`, *optional*, defaults to [2, 2, 2]):
             Striding used in the audio conditioning network
         resid_dropout (`int`, *optional*, defaults to 0):
             Residual dropout used in the attention pattern.
@@ -264,10 +234,8 @@ class JukeboxPriorConfig(PretrainedConfig):
             Spread used in the `summary_spread_attention` pattern
         timing_dims (`int`, *optional*, defaults to 64):
             _description_
-        width (`int`, *optional*, defaults to 2048):
-            Dimension of the attention layers. # TODO this is a bit confusing
         zero_out (`bool`, *optional*, defaults to False):
-            Whether or not to zero out convolution weights when initialising.
+            Whether or not to zero out convolution weights when initializing.
     """
 
     model_type = "jukebox"
@@ -290,34 +258,21 @@ def __init__(
         conv_res_scale=None,
         depth=72,
         emb_dropout=0,
-        embed_dim=2048,
-        encoder_attention_multiplier=0.25,
-        encoder_attention_pattern="RawColumnPreviousRowAttention",
-        encoder_attn_dropout=0.0,
-        encoder_blocks=32,
-        encoder_depth=18,
-        encoder_emb_dropout=0.0,
-        encoder_heads=4,
-        encoder_init_scale=0.1,
-        encoder_loss_fraction=[0.4, 0.0, 0.0],
-        encoder_mlp_multiplier=1.0,
-        encoder_n_vocab=79,
-        encoder_attn_res_scale=False,
-        encoder_resid_dropout=0.0,
-        encoder_spread=None,
-        encoder_width=128,
-        encoder_zero_out=False,
+        encoder_config=None,
+        encoder_loss_fraction=0.4,
+        hidden_size=2048,
         init_scale=0.2,
         is_encoder_decoder=True,
-        lyric_conditioning=True,
+        lyric_vocab_size=80,
         mask=False,
         max_duration=600,
         max_nb_genres=1,
         merged_decoder=True,
         metadata_conditioning=True,
-        metadata_dims=(604, 7898),
+        metadata_dims=[604, 7898],
         min_duration=0,
         mlp_multiplier=1.0,
+        music_vocab_size=2048,
         n_ctx=6144,
         n_heads=2,
         nb_relevant_lyric_tokens=384,
@@ -326,13 +281,12 @@ def __init__(
         res_convolution_multiplier=1,
         res_dilation_cycle=None,
         res_dilation_growth_rate=1,
-        res_downs_t=(3, 2, 2),
-        res_strides_t=(2, 2, 2),
+        res_downs_t=[3, 2, 2],
+        res_strides_t=[2, 2, 2],
         resid_dropout=0,
         sampling_rate=44100,
         spread=None,
         timing_dims=64,
-        width=2048,
         zero_out=False,
         **kwargs
     ):
@@ -348,26 +302,15 @@ def __init__(
         self.conv_res_scale = conv_res_scale
         self.depth = depth
         self.emb_dropout = emb_dropout
-        self.embed_dim = embed_dim
-        self.encoder_attention_multiplier = encoder_attention_multiplier
-        self.encoder_attention_pattern = encoder_attention_pattern
-        self.encoder_attn_dropout = encoder_attn_dropout
-        self.encoder_attn_res_scale = encoder_attn_res_scale
-        self.encoder_blocks = encoder_blocks
-        self.encoder_depth = encoder_depth
-        self.encoder_emb_dropout = encoder_emb_dropout
-        self.encoder_heads = encoder_heads
-        self.encoder_init_scale = encoder_init_scale
+        self.music_vocab_size = music_vocab_size
+        if encoder_config is not None:
+            self.encoder_config = JukeboxPriorConfig(**encoder_config)
+        else :
+            self.encoder_config = None
         self.encoder_loss_fraction = encoder_loss_fraction
-        self.encoder_mlp_multiplier = encoder_mlp_multiplier
-        self.encoder_n_vocab = encoder_n_vocab
-        self.encoder_resid_dropout = encoder_resid_dropout
-        self.encoder_spread = encoder_spread
-        self.encoder_width = encoder_width
-        self.encoder_zero_out = encoder_zero_out
         self.init_scale = init_scale
         self.is_encoder_decoder = is_encoder_decoder
-        self.lyric_conditioning = lyric_conditioning
+        self.lyric_vocab_size = lyric_vocab_size
         self.mask = mask
         self.max_duration = max_duration
         self.max_nb_genres = max_nb_genres
@@ -390,7 +333,7 @@ def __init__(
         self.sampling_rate = sampling_rate
         self.spread = spread
         self.timing_dims = timing_dims
-        self.width = width
+        self.hidden_size = hidden_size
         self.zero_out = zero_out
 
     @classmethod
@@ -398,7 +341,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
-        # get the text config dict if we are loading from CLIPConfig
+        # get the prior config dict if we are loading from JukeboxConfig
         if config_dict.get("model_type") == "jukebox_prior":
             config_dict = config_dict["prior_configs"]
 
@@ -410,6 +353,17 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
 
         return cls.from_dict(config_dict, **kwargs)
 
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["encoder_config"] = self.encoder_config.to_dict() if self.encoder_config is not None else None
+        output["model_type"] = self.__class__.model_type
+        return output
 
 class JukeboxVQVAEConfig(PretrainedConfig):
     """
@@ -424,20 +378,20 @@ class JukeboxVQVAEConfig(PretrainedConfig):
     Args:
         act_fn (`str`, *optional*, defaults to "relu"):
             _description_
-        codebook_dimension (`int`, *optional*, defaults to 2048):
-            Number of codes to use in each of the VQVAE.
+        nb_discrete_codes (`int`, *optional*, defaults to 2048):
+            Number of codes of the VQVAE.
         commit (`float`, *optional*, defaults to 0.02):
             Commit loss multiplier.
         conv_input_shape (`int`, *optional*, defaults to 1):
             Number of audio channels.
         conv_res_scale (`bool`, *optional*, defaults to False):
-            _description_
+            Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
         embed_dim (`int`, *optional*, defaults to 64):
-            Embeding dimension of the codebook vectors.
+            Embedding dimension of the codebook vectors.
         hop_fraction (`List[`int`]`, *optional*, defaults to [0.125, 0.5, 0.5]):
             Fraction of non-intersecting window used when continuing the sampling process.
         levels (`int`, *optional*, defaults to 3):
-            Number of hierachical levels that used in the VQVAE.
+            Number of hierarchical levels that used in the VQVAE.
         lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
@@ -455,10 +409,10 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             of reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
-        res_downs_t (`tuple(int)`, *optional*, defaults to (3, 2, 2)):
-                Downsampling rate for each level of the hierachical VQ-VAE.
-        res_strides_t (`tuple(int)`, *optional*, defaults to (2, 2, 2)):
-            Stride used for each level of the hierachical VQ-VAE.
+        res_downs_t (`List[`int`]`, *optional*, defaults to [3, 2, 2]):
+                Downsampling rate for each level of the hierarchical VQ-VAE.
+        res_strides_t (`List[`int`]`, *optional*, defaults to [2, 2, 2]):
+            Stride used for each level of the hierarchical VQ-VAE.
         sample_length (`int`, *optional*, defaults to 1058304):
             Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
     """
@@ -466,7 +420,7 @@ class JukeboxVQVAEConfig(PretrainedConfig):
     def __init__(
         self,
         act_fn="relu",
-        codebook_dimension=2048,
+        nb_discrete_codes=2048,
         commit=0.02,
         conv_input_shape=1,
         conv_res_scale=False,
@@ -493,7 +447,7 @@ def __init__(
         # VQVAE parameters (all used)
         self.levels = levels
         self.embed_dim = embed_dim
-        self.codebook_dimension = codebook_dimension
+        self.nb_discrete_codes = nb_discrete_codes
         self.res_conv_width = res_conv_width
         self.res_conv_depth = res_conv_depth
         self.res_convolution_multiplier = res_convolution_multiplier
@@ -535,7 +489,7 @@ class JukeboxConfig(PretrainedConfig):
     [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
 
 
-    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsamoling =
+    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
     (5,3), and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
@@ -545,7 +499,7 @@ class JukeboxConfig(PretrainedConfig):
         prior_config_list (`List[`JukeboxPriorConfig`]`, *optional*, defaults to None):
             _description_
         nb_priors (`int`, *optional*, defaults to 3):
-            Number of prior models that will sequentialy sample tokens. Each prior is conditional auto regressive
+            Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
             (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
             trained using a top prior and 2 upsampler priors.
         sampling_rate (`int`, *optional*, defaults to 44100):
@@ -553,7 +507,7 @@ class JukeboxConfig(PretrainedConfig):
         timing_dims (`int`, *optional*, defaults to 64):
             Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
             layer. The timing embedding layer converts the absolute and relative position in the currently sampled
-            audio to a tensor of lenght `timing_dims` that will be added to the music tokens.
+            audio to a tensor of length `timing_dims` that will be added to the music tokens.
         min_duration (`int`, *optional*, defaults to 0):
             Minimum duration of the audios to generate
         max_duration (`float`, *optional*, defaults to 600.0):
@@ -564,7 +518,7 @@ class JukeboxConfig(PretrainedConfig):
             Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
             duration.
         init_std (`float`, *optional*, defaults to 0.2):
-            Standard deviation used to inital the model.
+            Standard deviation used to initial the model.
 
     Example:
 
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index cc7380d735df5..490a2f41ed78b 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -77,7 +77,7 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
 def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
     """
     Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
-    returned. If the provided token sequence is smaller, it will be padded, othewise, only characters ranging from the
+    returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
     midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
     the most relevant tokens (in time) for the sequence.
 
@@ -90,7 +90,7 @@ def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, off
             Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
             account
         duration (`int`):
-            Expected duration of the generated music, in samples. The duration has to be smaller than the total lenght,
+            Expected duration of the generated music, in samples. The duration has to be smaller than the total length,
             which represent the overall length of the signal,
     """
     full_tokens = full_tokens[0]
@@ -175,7 +175,7 @@ def get_alignment(music_tokens, labels, prior, config):
     return alignments
 
 
-def save_temp_audio(fname, lvl, metas, aud, sampling_rate):
+def save_temp_audio(fname, lvl, metas, aud):
     aud = torch.clamp(aud, -1, 1).cpu().numpy()
     for i in list(range(aud.shape[0])):
         if metas is not None:
@@ -377,55 +377,55 @@ def forward(self, hidden_states, all_levels=True):
 class JukeboxBottleneckBlock(nn.Module):
     def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__()
-        self.codebook_dim = config.codebook_dimension
+        self.nb_discrete_codes = config.nb_discrete_codes
         self.codebook_width = config.embed_dim
         self.mu = config.lmu
         self.threshold = 1.0
         self.init = False
         self.codebook_sum = None
         self.codebook_elem = None
-        self.register_buffer("codebook", torch.zeros(self.codebook_dim, self.codebook_width))
+        self.register_buffer("codebook", torch.zeros(self.nb_discrete_codes, self.codebook_width))
 
     def _tile(self, hidden_states):
         dim, embed_width = hidden_states.shape
-        if dim < self.codebook_dim:
-            n_repeats = (self.codebook_dim + dim - 1) // dim
+        if dim < self.nb_discrete_codes:
+            n_repeats = (self.nb_discrete_codes + dim - 1) // dim
             std = 0.01 / np.sqrt(embed_width)
             hidden_states = hidden_states.repeat(n_repeats, 1)
             hidden_states = hidden_states + torch.randn_like(hidden_states) * std
         return hidden_states
 
     def init_codebook(self, hidden_states):
-        codebook_dim = self.codebook_dim
+        nb_discrete_codes = self.nb_discrete_codes
         self.init = True
         # init k_w using random vectors from hidden_states codebook_w (index w?)
         codes = self._tile(hidden_states)
-        self.codebook = codes[torch.randperm(codes.shape[0])][:codebook_dim]
+        self.codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
         self.codebook_sum = self.codebook
-        self.codebook_elem = torch.ones(codebook_dim, device=self.codebook.device)
+        self.codebook_elem = torch.ones(nb_discrete_codes, device=self.codebook.device)
 
     def update_codebook(self, hidden_states, latent_states):
-        mu, codebook_width, codebook_dim = self.mu, self.codebook_width, self.codebook_dim
+        mu, codebook_width, nb_discrete_codes = self.mu, self.codebook_width, self.nb_discrete_codes
         with torch.no_grad():
             # Calculate new centres
             latent_states_onehot = torch.zeros(
-                codebook_dim, hidden_states.shape[0], device=hidden_states.device
-            )  # codebook_dim, batch_size * L
+                nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device
+            )  # nb_discrete_codes, batch_size * L
             latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
 
-            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # codebook_dim, w
-            _codebook_elem = latent_states_onehot.sum(dim=-1)  # codebook_dim
+            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # nb_discrete_codes, w
+            _codebook_elem = latent_states_onehot.sum(dim=-1)  # nb_discrete_codes
             codes = self._tile(hidden_states)
-            _random_codebook = codes[torch.randperm(codes.shape[0])][:codebook_dim]
+            _random_codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
 
             # Update centres
             old_codebook = self.codebook
-            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum  # w, codebook_dim
-            self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # codebook_dim
-            usage = (self.codebook_elem.view(codebook_dim, 1) >= self.threshold).float()
+            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum  # w, nb_discrete_codes
+            self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # nb_discrete_codes
+            usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
             self.codebook = (
                 usage
-                * (self.codebook_sum.view(codebook_dim, codebook_width) / self.codebook_elem.view(codebook_dim, 1))
+                * (self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view(nb_discrete_codes, 1))
                 + (1 - usage) * _random_codebook
             )
             _codebook_prob = _codebook_elem / torch.sum(
@@ -442,7 +442,7 @@ def preprocess(self, hidden_states):
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
         hidden_states = hidden_states.view(
             -1, hidden_states.shape[-1]
-        )  # x_en = (batch_size *L, w), k_j = (w, codebook_dim)
+        )  # x_en = (batch_size *L, w), k_j = (w, nb_discrete_codes)
 
         if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
@@ -617,7 +617,7 @@ def __init__(self, config: JukeboxVQVAEConfig):
             ) * top_raw_to_tokens
             config.sample_length = config.sample_length.astype(int)
 
-        self.codebook_dim = config.codebook_dimension
+        self.nb_discrete_codes = config.nb_discrete_codes
         self.commit = config.commit
         self.sample_length = config.sample_length
 
@@ -662,13 +662,13 @@ def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> to
         Args:
             music_tokens (`torch.LongTensor`):
                 Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
-                should be an index to a coresponding `code` vector in the codebook.
+                should be an index to a corresponding `code` vector in the codebook.
             start_level (`int`, *optional*):
                 Level at which the decoding process will start. Default to 0.
             end_level (`int`, *optional*):
                 Level at which the decoding process will start. Default to None.
             bs_chunks (int, *optional*):
-                Number of chuncks to process at the same time.
+                Number of chunks to process at the same time.
         """
         token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
         dequantised_states = []
@@ -704,7 +704,7 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
             end_level (`int`, *optional*):
                 Level at which the encoding process will start. Default to None.
             bs_chunks (int, *optional*):
-                Number of chuncks of raw audio to process at the same time.
+                Number of chunks of raw audio to process at the same time.
         """
         audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
         music_tokens_list = []
@@ -716,7 +716,7 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
 
     def sample(self, n_samples):
         music_tokens = [
-            torch.randint(0, self.codebook_dim, size=(n_samples, *music_tokens_shape), device="cpu")
+            torch.randint(0, self.nb_discrete_codes, size=(n_samples, *music_tokens_shape), device="cpu")
             for music_tokens_shape in self.music_tokens_shapes
         ]
         return self.decode(music_tokens)
@@ -734,7 +734,7 @@ def forward(self, raw_audio):
 
 
         Returns:
-            `Tuple[torch.Tensor, torch.Tensoor`
+            `Tuple[torch.Tensor, torch.Tensor`
 
 
         Example:
@@ -774,7 +774,7 @@ class JukeboxMLP(nn.Module):
     def __init__(self, config):
         # a single channel is always used in original code
         super().__init__()
-        embed_dim = config.width
+        embed_dim = config.hidden_size
         hidden_dim = int(config.mlp_multiplier * embed_dim)
 
         self.c_fc = JukeboxConv1D(embed_dim, hidden_dim)
@@ -804,27 +804,21 @@ def forward(self, input):
 
 
 class JukeboxAttention(nn.Module):
-    def __init__(
-        self,
-        config,
-        n_ctx,
-        attn_func="dense_attn",
-        encoder_len=None,
-    ):
+    def __init__(self,config,n_ctx,attn_func="dense_attn",encoder_len=None):
         super().__init__()
-        self.embed_dim = config.width
+        self.embed_dim = config.hidden_size
         self.n_heads = config.n_heads
         self.dropout = config.attn_dropout
         hidden_dim = int(config.attention_multiplier * self.embed_dim)
 
         self.head_dim = hidden_dim // config.n_heads
-        self.n_ctx = n_ctx  # NOTE: n_ctx could be different within operations. This is complete n_ctx
+        self.n_ctx = n_ctx
         self.hidden_dim = hidden_dim
         self.scale = self.head_dim**-0.25
         self.mask = config.mask
 
         if attn_func == "cross_attention":
-            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim)  # issue here, for single enc decoder different
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim)
             self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2)
         else:
             self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
@@ -903,21 +897,21 @@ def merge_heads(self, hidden_states):
         new_hidden_states_shape = (*hidden_states.size()[:-2], hidden_states.size(-2) * hidden_states.size(-1))
         return hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct merge_states
 
-    def split_heads(self, hidden_states, k=False):
+    def split_heads(self, hidden_states, is_key=False):
         new_hidden_states_shape = (
             *hidden_states.size()[:-1],
             self.n_heads,
             hidden_states.size(-1) // self.n_heads,
         )
         hidden_states = hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct split_states
-        if k:
+        if is_key:
             return hidden_states.permute(0, 2, 3, 1)
         else:
             return hidden_states.permute(0, 2, 1, 3)
 
     def dense_attn(self, query, key, value, sample):
         query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
+        key = self.split_heads(key, is_key=True)
         value = self.split_heads(value)
         context_states = self._attn(query, key, value, sample)
         context_states = self.merge_heads(context_states)
@@ -1142,13 +1136,14 @@ def _suff_cache_len(self):
             key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
             location in the context.
         """
+        previous_block_length = (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx
         REQUIRED_CACHE_LEN = {
             "dense_attn": self.sample_t,
             "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
             "transpose_block_attn": self.sample_t,
             "prev_block_attn": self.sample_t
             if self.sample_t <= self.block_ctx
-            else (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx,
+            else previous_block_length,
             "cross_attn": self.encoder_len,
             "prime_attn": min(self.sample_t, self._encoder_len),
         }
@@ -1185,15 +1180,9 @@ def del_cache(self):
 
 
 class JukeboxBlock(nn.Module):
-    def __init__(
-        self,
-        config,
-        n_ctx,
-        attn_func="dense_attn",
-        encoder_len=None,
-    ):
+    def __init__(self,config,n_ctx,attn_func="dense_attn",encoder_len=None):
         super().__init__()
-        self.width = config.width
+        self.width = config.hidden_size
         self.attn = JukeboxAttention(
             config=config,
             n_ctx=n_ctx,
@@ -1201,9 +1190,9 @@ def __init__(
             encoder_len=encoder_len,
         )
 
-        self.layer_norm_0 = JukeboxLayerNorm(config.width)
+        self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size)
         self.mlp = JukeboxMLP(config)
-        self.layer_norm_1 = JukeboxLayerNorm(config.width)
+        self.layer_norm_1 = JukeboxLayerNorm(config.hidden_size)
         self.res_scale = 1.0 / config.depth if config.attn_res_scale else 1.0
         self.attn_func = attn_func
 
@@ -1222,15 +1211,10 @@ def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
 
 
 class JukeboxLayerStack(nn.Module):
-    def __init__(
-        self,
-        config,
-        n_ctx,
-        encoder_len=None,
-    ):
+    def __init__(self,config,n_ctx,encoder_len=None):
         super().__init__()
         self.n_ctx = n_ctx
-        self.width = config.width
+        self.width = config.hidden_size
         self.depth = config.depth
         self.blocks = config.blocks
         self.attention_pattern = config.attention_pattern
@@ -1308,28 +1292,40 @@ def __init__(
         is_encoder=False,
     ):
         """
-        - embed_dim : either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook
-        dimension, if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
-        for the lyric tokens. The width corresponds to the number of tokens or lyrics tokens provided in a single pass.
-        It can be different from the embed dim.
-        - audio_conditioning : whether or not the prior supports conditionning on audio.
-        - metadata_conditioning : whether or not the prior supports conditionning on artitst, genres, lyrics and
-          timing. When
-        False, the start token is random.
-        - encoder_len : for now len of the lyric hidden states
+        _summary_
+
+        Args:
+            config (`JukeboxPriorConfig`):
+                Model configuration class with all the parameters of the model.
+                Initializing with a config file does not load the weights associated with the model, only the
+                configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+            n_ctx (`int`, *optional*, defaults to `None)`:
+                number of tokens or lyrics tokens provided in a single pass.
+            embed_dim (`int`, *optional*, defaults to `None)`:
+                either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
+                if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
+            audio_conditioning (`bool`, *optional*, defaults to `False)`:
+                whether or not the prior supports conditionning on audio.
+            metadata_conditioning (`bool`, *optional*, defaults to `False)`:
+                whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
+            encoder_len (`int`, *optional*, defaults to `0)`:
+                length of the encoder #TODO this is related to n_vocab
+            is_encoder (`bool`, *optional*, defaults to `False)`:
+                _description_
         """
+
         super().__init__()
-        self.width = config.width
+        self.width = config.hidden_size
         self.depth = config.depth
         self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
-        self.embed_dim = embed_dim if embed_dim is not None else config.embed_dim
-        self.embed_tokens = nn.Embedding(self.embed_dim, config.width)
+        self.embed_dim = embed_dim if embed_dim is not None else config.music_vocab_size
+        self.embed_tokens = nn.Embedding(self.embed_dim, config.hidden_size)
         self.embed_tokens_dropout = nn.Dropout(config.emb_dropout)
         self.metadata_conditioning = metadata_conditioning
         self.audio_conditioning = audio_conditioning
         if not metadata_conditioning:
-            self.start_token = nn.Parameter(torch.empty((1, config.width)))
-        self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.width)
+            self.start_token = nn.Parameter(torch.empty((1, config.hidden_size)))
+        self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.hidden_size)
         self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
 
         self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx, encoder_len=encoder_len)
@@ -1345,18 +1341,11 @@ def __init__(
             self.share_embed_tokens_fc_proj_out = True
 
         if not is_encoder:
-            self.fc_proj_out = nn.Linear(config.width, self.embed_dim, bias=False)
+            self.fc_proj_out = nn.Linear(config.hidden_size, self.embed_dim, bias=False)
             if self.share_embed_tokens_fc_proj_out:
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
 
-    def postprocess(self, tokens, sample_tokens=None):
-        # Convert back from NL and long to NHWC
-        batch_size = tokens.shape[0]
-        if sample_tokens is None or sample_tokens == self.embed_dim:
-            return tokens.view(batch_size, *self.embed_dim)
-        else:
-            return tokens.view(batch_size, -1)
 
     def forward(
         self,
@@ -1369,7 +1358,9 @@ def forward(
         get_sep_loss=False,
     ):
         """
-        - tokens : composed of both music tokens and lyrics tokens or just music tokens
+        Args:
+
+        tokens : composed of both music tokens and lyrics tokens or just music tokens
         depending on the `merged_decoder` flag.
         """
         # Preprocess.
@@ -1386,9 +1377,8 @@ def forward(
 
         target = tokens  # Target
         hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
-        hidden_states = torch.cat(
-            (hidden_states[:, -1:], hidden_states[:, :-1]), dim=1
-        )  # Shift by 1, and fill in start token
+        # Shift by 1, and fill in start token
+        hidden_states = torch.cat((hidden_states[:, -1:], hidden_states[:, :-1]), dim=1)
         if self.metadata_conditioning:
             hidden_states[:, 0] = metadata_conditioning.view(batch_size, self.width)
         else:
@@ -1443,9 +1433,8 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
             cond = audio_conditioning[:, sample_t : sample_t + 1, :]
         else:
             cond = audio_conditioning
-        hidden_states = (
-            hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond
-        )  # Pos emb, dropout is identity at eval time
+        # Pos emb, dropout is identity at eval time
+        hidden_states = (hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond)
         return hidden_states, cond
 
     def sample(
@@ -1492,9 +1481,8 @@ def sample(
                 # Adjust logits
                 hidden_states = hidden_states / temp
                 hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
-                tokens = torch.distributions.Categorical(
-                    logits=hidden_states
-                ).sample()  # Sample and replace hidden_states
+                # Sample and replace hidden_states
+                tokens = torch.distributions.Categorical(logits=hidden_states).sample()
                 sampled_tokens.append(tokens.clone())
 
             del tokens
@@ -1503,7 +1491,7 @@ def sample(
             tokens = torch.cat(sampled_tokens, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            tokens = self.postprocess(tokens, sample_tokens)
+            # tokens = self.postprocess(tokens, sample_tokens)
         if get_preds:
             return tokens, preds
         else:
@@ -1529,7 +1517,7 @@ def primed_sample(
         sample_tokens=None,
     ):
         if sample_tokens is None:
-            sample_tokens = self.embed_dim
+            sample_tokens = self.n_ctx
         # Preprocess.
         batch_size = lyric_and_music_tokens.shape[0]
         with torch.no_grad():
@@ -1589,16 +1577,13 @@ def primed_sample(
             # the input of the encoder and decoder can be merged into (lyrics, music tokens)
             input_tokens = sampled_audio[-1]
 
-            iter = tqdm(range(len(sampled_audio), sample_tokens), leave=False)
-            for sample_t in iter:
-                iter.set_description(f"Primed sampling {len(iter)} music tokens", refresh=True)
+            itererator = tqdm(range(len(sampled_audio), sample_tokens), desc = f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens", leave=False)
+            for sample_t in itererator:
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning
                 )
 
-                hidden_states = self.transformer(
-                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
-                )  # Transformer
+                hidden_states = self.transformer(hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True)
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
@@ -1618,7 +1603,7 @@ def primed_sample(
             music_tokens = torch.cat(sampled_audio, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            music_tokens = self.postprocess(music_tokens, sample_tokens)
+            # music_tokens = self.postprocess(music_tokens, sample_tokens)
         if get_preds:
             return music_tokens, preds
         else:
@@ -1627,38 +1612,34 @@ def primed_sample(
 
 class JukeboxMusicTokenConditioner(nn.Module):
     """
-    The JukeboxMusicTokenConditioner takes music tokens as an input (coresponding to vocabularies in the VQ-VAE
-    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQ-VAE).
-
-    The embedding layer is different from the vaqvae's bottleneck
-
+    The `JukeboxMusicTokenConditioner` takes music tokens as an input (coresponding to the codes of the VQVAE's
+    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
     """
 
     def __init__(self, config, level):
 
         super().__init__()
-        self.embed_tokens = nn.Embedding(config.embed_dim, config.width)
+        self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size)
+        config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder`
 
-        # JukeboxMusicTokenConditioner, takes as input either uper level tokens, upsamples them to feed them to the next level?
         self.upsampler = JukeboxDecoderConvBock(
             config,
-            config.width,
+            config.hidden_size,
             config.res_conv_width,
             config.res_conv_depth,
             config.res_downs_t[level],
             config.res_strides_t[level],
             reverse_dilation=False,
         )
-        self.layer_norm = JukeboxLayerNorm(config.width)
+        self.layer_norm = JukeboxLayerNorm(config.hidden_size)
 
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
         Args :
             music_tokens (`torch.LongTensor`):
-                Music tokens form the uper level in range(codebook_dim)
+                Music tokens form the uper level in range(nb_discrete_codes)
             raw_audio_conditionning (`torch.LongTensor`):
-                Audio used when primed sampling, raw audio information that conditions
-            the generation
+                Audio used when primed sampling, raw audio information that conditions the generation
         """
         if raw_audio_conditionning is None:
             raw_audio_conditionning = 0.0
@@ -1717,9 +1698,8 @@ def forward(self, pos_start, pos_end=None):
 
         # Bin each value to bins_
         normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)  # [0,1)
-        bins_ = (
-            (self.embed_dim * normalised_position).floor().long().detach()
-        )  # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1]
+        # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1
+        bins_ = (self.embed_dim * normalised_position).floor().long().detach()
         return self.emb(bins_)
 
 
@@ -1727,7 +1707,7 @@ class LabelConditioner(nn.Module):
     def __init__(self, config, include_time_signal):
         super().__init__()
 
-        embed_dim = config.width
+        embed_dim = config.hidden_size
         timing_dims = config.timing_dims
         sampling_rate = config.sampling_rate
         nb_genres, nb_artists = config.metadata_dims
@@ -1781,32 +1761,31 @@ def forward(self, metadata):
 
 class JukeboxPrior(PreTrainedModel):
     """
-    Model the prior on vq codes conditioned on timing, artist, genre, lyrics and codes from levels above. To condition
-    on the timing, genre and artist, we use the LabelConditioner class To condition on the codes from the level above,
-    we use the JukeboxMusicTokenConditioner class To condition on lyrics, we allow two types of priors:
-    - Separate Encoder Decoder: This is the usual encoder-decoder style transformer. The encoder transformer
-      autoregressively
-    models the lyrics, and we use its last layer to produce keys/values that are attened to by the decoder transformer
-    - Single Encoder Decoder: This is a simplification where we combine them into a single model. We merge the text
-      vocab
-    and VQ vocab into a single large vocab, and the lyric tokens and VQ tokens into a single longer sequence of tokens
-    which we autoregressively model together.
-
-    Question : why are the embeddings from the vq-vae not used? Or am I crazy? In the forward it is used, but not in
-    the primed sample or sample functions. If the model is not trained using these/ uses the forward differently then I
-    guess it is fine but otherwise it looks strange.
-    """
+    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be seen
+    as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù is defined,
+    it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist, genre, lyrics and codes
+    from lower-levels Priors.
 
+    Args:
+        config (`JukeboxPriorConfig`):
+            Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+        level (`int`):
+            Current level of the Prior. Should be in range `0,nb_priors`.
+        nb_priors (`int`, *optional*, defaults to `3)`:
+            Total number of priors.
+        vqvae_encoder (`Callable`, *optional*, defaults to `None)`:
+            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions
+            instead of the vqvae module to avoid getting the parameters.
+        vqvae_decoder (`Callable`, *optional*, defaults to `None)`:
+            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions
+            instead of the vqvae module to avoid getting the parameters.
+    """
     config_class = JukeboxPriorConfig
 
-    def __init__(
-        self,
-        config: JukeboxPriorConfig,
-        level,
-        nb_priors=3,
-        vqvae_encoder=None,
-        vqvae_decoder=None,
-    ):
+    def __init__(self,config: JukeboxPriorConfig,level,nb_priors=3,vqvae_encoder=None,vqvae_decoder=None):
+
         super().__init__(config)
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
         # forward loop
@@ -1838,15 +1817,15 @@ def __init__(
         if config.is_encoder_decoder:
             # encoder-decoder transformer
             self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx]
-            self.embed_dim_shift = [0, config.encoder_n_vocab]
-            self.width = config.width
+            self.embed_dim_shift = [0, config.lyric_vocab_size]
+            self.width = config.hidden_size
 
             self.lyrics_enc_loss_dims = config.nb_relevant_lyric_tokens
 
             self.prior = JukeboxConditionalAutoregressive(
                 config,
                 n_ctx=config.nb_relevant_lyric_tokens + config.n_ctx,
-                embed_dim=config.encoder_n_vocab + config.embed_dim,
+                embed_dim=config.lyric_vocab_size + config.music_vocab_size,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
                 encoder_len=self.lyrics_enc_loss_dims,
@@ -1855,13 +1834,13 @@ def __init__(
         else:
             # Separate encoder-decoder transformer
             #  we have to modify the config to use the encoder variables for the lyric encoder
-            encoder_config = self._get_encoder_config(config)
+            encoder_config = config.encoder_config
 
             if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
                 self.lyrics_enc_loss_dims = self.nb_relevant_lyric_tokens
-                self.lyric_acts_width = encoder_config.width
-                self.encoder_width = config.width
-                self.encoder_dim = encoder_config.encoder_n_vocab
+                self.lyric_acts_width = encoder_config.hidden_size
+                self.encoder_width = config.hidden_size
+                self.encoder_dim = config.lyric_vocab_size
                 self.encoder = JukeboxConditionalAutoregressive(
                     encoder_config,
                     n_ctx=self.nb_relevant_lyric_tokens,
@@ -1870,9 +1849,9 @@ def __init__(
                     metadata_conditioning=False,
                     is_encoder=True,
                 )
-                self.encoder.proj_in = JukeboxConv1D(encoder_config.width, config.width)
-                self.encoder.final_layer_norm = JukeboxLayerNorm(config.width)
-                self.encoder.lm_head = nn.Linear(config.width, encoder_config.encoder_n_vocab, bias=False)
+                self.encoder.proj_in = JukeboxConv1D(encoder_config.hidden_size, config.hidden_size)
+                self.encoder.final_layer_norm = JukeboxLayerNorm(config.hidden_size)
+                self.encoder.lm_head = nn.Linear(config.hidden_size, config.lyric_vocab_size, bias=False)
             else:
                 self.lyrics_enc_loss_dims = 0
 
@@ -1896,27 +1875,6 @@ def __init__(
             f" length:{self.sample_length}"
         )
 
-    def _get_encoder_config(self, config):
-        # Set config to use the lyric encoder parameters
-        encoder_config = copy.deepcopy(config)
-        encoder_config.attn_dropout = config.encoder_attn_dropout
-        encoder_config.attention_pattern = config.encoder_attention_pattern
-        encoder_config.blocks = config.encoder_blocks
-        encoder_config.depth = config.encoder_depth
-        encoder_config.emb_dropout = config.encoder_emb_dropout
-        encoder_config.heads = config.encoder_heads
-        encoder_config.init_scale = config.encoder_init_scale
-        encoder_config.loss_fraction = config.encoder_loss_fraction
-        encoder_config.attention_multiplier = config.encoder_attention_multiplier
-        encoder_config.mlp_multiplier = config.encoder_mlp_multiplier
-        encoder_config.resid_dropout = config.encoder_resid_dropout
-        encoder_config.attn_res_scale = config.encoder_attn_res_scale
-        encoder_config.spread = config.encoder_spread
-        encoder_config.width = config.encoder_width
-        encoder_config.zero_out = config.encoder_zero_out
-        encoder_config.n_vocab = config.encoder_n_vocab
-        return encoder_config
-
     def get_metadata(self, labels, start, total_length, offset, get_indices=False):
         metadata = labels.clone()
         metadata[:, 0] = total_length
@@ -1978,7 +1936,7 @@ def get_music_tokens_conds(self, music_tokens, start, end):
     def prior_preprocess(self, tokens, conds):
         """
         Shifts the input tokens to account for the dictionnary merge. The embed_dim_shift give by how much the music
-        tokens should be shifted by. It is equal to encoder_n_vocab.
+        tokens should be shifted by. It is equal to lyric_vocab_size.
         """
         batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
@@ -2033,7 +1991,7 @@ def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
             end_level = self.levels
         # Get latents
         with torch.no_grad():
-            latent_states = self.encoder(
+            latent_states = self.vqvae_encoder(
                 hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
             )
         return latent_states
@@ -2047,7 +2005,7 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
         if end_level is None:
             end_level = self.levels
         with torch.no_grad():
-            output = self.decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+            output = self.vqvae_decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
         return output
 
     def get_cond(self, music_tokens_conds, metadata):
@@ -2373,7 +2331,7 @@ def split_batch(self, obj, n_samples, split_size):
             raise TypeError("Unknown input type")
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
-    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample):
+    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, max_batch_size):
         prior = self.priors[level]
         sampled_tokens = music_tokens[level]
         n_ctx = prior.n_ctx
@@ -2385,10 +2343,10 @@ def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, l
             sampling_kwargs["sample_tokens"] = n_ctx
             start = nb_sampled_tokens - n_ctx + tokens_to_sample
 
-        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
+        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size)
 
     # Sample a single window of length=n_ctx at position=start on level=level
-    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start):
+    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size):
         prior = self.priors[level]
         n_samples = music_tokens[0].shape[0]
         n_ctx = prior.n_ctx
@@ -2417,6 +2375,7 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             # Nothing new to sample
             return music_tokens
 
+
         # get music_tokens_conds from level above
         music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
         # if there are no levels above should return None!
@@ -2424,15 +2383,16 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         # set metadata offset, sample_length and lyrics tokens
         metadata = prior.get_metadata(labels, start, self.total_length, offset)
 
-        max_batch_size = sampling_kwargs.pop("max_batch_size")
-
         music_tokens_list = self.split_batch(previous_sampled_tokens, n_samples, max_batch_size)
         music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
         metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
         tokens = []
         iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list), leave=False)
         for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
-            iterator.set_description(f"Sampling windows of {sample_tokens}")
+            name = ["Ancestral", "Primed"][music_tokens_i.shape[1] == 0]
+            iterator.set_description(
+                    f"[prior level {level}] {name} Sampling {sample_tokens} tokens out of {self.total_length//prior.raw_to_tokens}", refresh=True
+            )
             tokens_i = prior.sample(
                 n_samples=music_tokens_i.shape[0],
                 music_tokens=music_tokens_i,
@@ -2449,18 +2409,15 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         return music_tokens
 
     # Sample total_length tokens at level=level with hop_length=hop_length
-    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length):
+    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, max_batch_size):
         if total_length >= self.priors[level].n_ctx:
-            iterator = tqdm(get_starts(total_length, self.priors[level].n_ctx, hop_length), leave=False)
-            for start in get_starts(total_length, self.priors[level].n_ctx, hop_length):
-                iterator.set_description(
-                    f"[prior level {level}] Sampling {self.priors[level].n_ctx}/{total_length} tokens", refresh=True
-                )
-                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start)
+            iterator = get_starts(total_length, self.priors[level].n_ctx, hop_length)
+            for start in iterator:
+                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size)
 
         else:
             music_tokens = self.sample_partial_window(
-                music_tokens, labels, offset, sampling_kwargs, level, total_length
+                music_tokens, labels, offset, sampling_kwargs, level, total_length, max_batch_size
             )
         return music_tokens
 
@@ -2558,13 +2515,11 @@ def _sample(
         if sample_levels is None:
             sample_levels = range(len(self.priors))
 
-        self.total_length = (
-            total_length  # total length of the signal, might be bit different from the actual generated length
-        )
+        # total length of the signal, might be bit different from the actual generated length
+        self.total_length = total_length
         for level in sample_levels:
             sampling_kwargs = dict(
                 temp=0.99 if level == len(self.priors) - 1 else sampling_temperature,
-                max_batch_size=lower_batch_size if level != sample_levels else max_batch_size,
                 chunk_size=chunk_size,
                 sample_tokens=sample_tokens,
             )
@@ -2572,9 +2527,9 @@ def _sample(
 
             total_token_to_sample = total_length // self.priors[level].raw_to_tokens
             hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx)
-
+            max_batch_size = lower_batch_size if level != sample_levels else max_batch_size
             music_tokens = self.sample_level(
-                music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length
+                music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length, max_batch_size
             )
 
             if save_results:
@@ -2588,7 +2543,7 @@ def _sample(
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
                 save_temp_audio(
-                    logdir, level, metas=metas, aud=raw_audio.float(), sampling_rate=self.config.sampling_rate
+                    logdir, level, metas=metas, aud=raw_audio.float()
                 )
                 if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():

From 19f49b6fb444118f90e4a0b912abee96aad82fdc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 11:20:01 +0000
Subject: [PATCH 177/196] fixup

---
 .../models/jukebox/configuration_jukebox.py   |  3 +-
 .../models/jukebox/modeling_jukebox.py        | 72 ++++++++++++-------
 2 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index a37f41acc0d7e..6d14e972492e7 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -305,7 +305,7 @@ def __init__(
         self.music_vocab_size = music_vocab_size
         if encoder_config is not None:
             self.encoder_config = JukeboxPriorConfig(**encoder_config)
-        else :
+        else:
             self.encoder_config = None
         self.encoder_loss_fraction = encoder_loss_fraction
         self.init_scale = init_scale
@@ -365,6 +365,7 @@ def to_dict(self):
         output["model_type"] = self.__class__.model_type
         return output
 
+
 class JukeboxVQVAEConfig(PretrainedConfig):
     """
     This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index a40e3aca03c44..2a5bf2cae73e8 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """PyTorch Jukebox model."""
 
-import copy
 import math
 import os
 from typing import List
@@ -425,7 +424,10 @@ def update_codebook(self, hidden_states, latent_states):
             usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
             self.codebook = (
                 usage
-                * (self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view(nb_discrete_codes, 1))
+                * (
+                    self.codebook_sum.view(nb_discrete_codes, codebook_width)
+                    / self.codebook_elem.view(nb_discrete_codes, 1)
+                )
                 + (1 - usage) * _random_codebook
             )
             _codebook_prob = _codebook_elem / torch.sum(
@@ -802,7 +804,7 @@ def forward(self, input):
 
 
 class JukeboxAttention(nn.Module):
-    def __init__(self,config,n_ctx,attn_func="dense_attn",encoder_len=None):
+    def __init__(self, config, n_ctx, attn_func="dense_attn", encoder_len=None):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.n_heads = config.n_heads
@@ -1139,9 +1141,7 @@ def _suff_cache_len(self):
             "dense_attn": self.sample_t,
             "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
             "transpose_block_attn": self.sample_t,
-            "prev_block_attn": self.sample_t
-            if self.sample_t <= self.block_ctx
-            else previous_block_length,
+            "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx else previous_block_length,
             "cross_attn": self.encoder_len,
             "prime_attn": min(self.sample_t, self._encoder_len),
         }
@@ -1178,7 +1178,7 @@ def del_cache(self):
 
 
 class JukeboxBlock(nn.Module):
-    def __init__(self,config,n_ctx,attn_func="dense_attn",encoder_len=None):
+    def __init__(self, config, n_ctx, attn_func="dense_attn", encoder_len=None):
         super().__init__()
         self.width = config.hidden_size
         self.attn = JukeboxAttention(
@@ -1209,7 +1209,7 @@ def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
 
 
 class JukeboxLayerStack(nn.Module):
-    def __init__(self,config,n_ctx,encoder_len=None):
+    def __init__(self, config, n_ctx, encoder_len=None):
         super().__init__()
         self.n_ctx = n_ctx
         self.width = config.hidden_size
@@ -1344,7 +1344,6 @@ def __init__(
                 self.fc_proj_out.weight = self.embed_tokens.weight
             self.loss = torch.nn.CrossEntropyLoss()
 
-
     def forward(
         self,
         tokens,
@@ -1432,7 +1431,7 @@ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_cond
         else:
             cond = audio_conditioning
         # Pos emb, dropout is identity at eval time
-        hidden_states = (hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond)
+        hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond
         return hidden_states, cond
 
     def sample(
@@ -1575,13 +1574,19 @@ def primed_sample(
             # the input of the encoder and decoder can be merged into (lyrics, music tokens)
             input_tokens = sampled_audio[-1]
 
-            itererator = tqdm(range(len(sampled_audio), sample_tokens), desc = f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens", leave=False)
+            itererator = tqdm(
+                range(len(sampled_audio), sample_tokens),
+                desc=f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens",
+                leave=False,
+            )
             for sample_t in itererator:
                 hidden_states, cond = self.get_emb(
                     sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning
                 )
 
-                hidden_states = self.transformer(hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True)
+                hidden_states = self.transformer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+                )
                 if self.add_cond_after_transformer:
                     hidden_states = hidden_states + cond
                 hidden_states = self.fc_proj_out(hidden_states)  # Predictions
@@ -1618,7 +1623,7 @@ def __init__(self, config, level):
 
         super().__init__()
         self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size)
-        config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder`
+        config.embed_dim = config.music_vocab_size  # setting correct argument for the `JukeboxDecoder`
 
         self.upsampler = JukeboxDecoderConvBock(
             config,
@@ -1780,9 +1785,10 @@ class JukeboxPrior(PreTrainedModel):
             Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions
             instead of the vqvae module to avoid getting the parameters.
     """
+
     config_class = JukeboxPriorConfig
 
-    def __init__(self,config: JukeboxPriorConfig,level,nb_priors=3,vqvae_encoder=None,vqvae_decoder=None):
+    def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
 
         super().__init__(config)
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
@@ -2003,7 +2009,9 @@ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
         if end_level is None:
             end_level = self.levels
         with torch.no_grad():
-            output = self.vqvae_decoder(music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks)
+            output = self.vqvae_decoder(
+                music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+            )
         return output
 
     def get_cond(self, music_tokens_conds, metadata):
@@ -2294,7 +2302,9 @@ def __init__(self, config):
         vqvae_config = config.vqvae_config
         self.vqvae = JukeboxVQVAE(vqvae_config)
         self.set_shared_params(config)
-        self.priors = nn.ModuleList([JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)])
+        self.priors = nn.ModuleList(
+            [JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)]
+        )
 
     def set_shared_params(self, model_config):
         """
@@ -2327,7 +2337,9 @@ def split_batch(self, obj, n_samples, split_size):
             raise TypeError("Unknown input type")
 
     # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
-    def sample_partial_window(self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, max_batch_size):
+    def sample_partial_window(
+        self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, max_batch_size
+    ):
         prior = self.priors[level]
         sampled_tokens = music_tokens[level]
         n_ctx = prior.n_ctx
@@ -2366,7 +2378,6 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
             # Nothing new to sample
             return music_tokens
 
-
         # get music_tokens_conds from level above
         music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
         # if there are no levels above should return None!
@@ -2382,7 +2393,9 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
             name = ["Ancestral", "Primed"][music_tokens_i.shape[1] == 0]
             iterator.set_description(
-                    f"[prior level {level}] {name} Sampling {sample_tokens} tokens out of {self.total_length//prior.raw_to_tokens}", refresh=True
+                f"[prior level {level}] {name} Sampling {sample_tokens} tokens out of"
+                f" {self.total_length//prior.raw_to_tokens}",
+                refresh=True,
             )
             tokens_i = prior.sample(
                 n_samples=music_tokens_i.shape[0],
@@ -2400,11 +2413,15 @@ def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, le
         return music_tokens
 
     # Sample total_length tokens at level=level with hop_length=hop_length
-    def sample_level(self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, max_batch_size):
+    def sample_level(
+        self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, max_batch_size
+    ):
         if total_length >= self.priors[level].n_ctx:
             iterator = get_starts(total_length, self.priors[level].n_ctx, hop_length)
             for start in iterator:
-                music_tokens = self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size)
+                music_tokens = self.sample_single_window(
+                    music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size
+                )
 
         else:
             music_tokens = self.sample_partial_window(
@@ -2519,7 +2536,14 @@ def _sample(
             hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx)
             max_batch_size = lower_batch_size if level != sample_levels else max_batch_size
             music_tokens = self.sample_level(
-                music_tokens, labels[level], offset, sampling_kwargs, level, total_token_to_sample, hop_length, max_batch_size
+                music_tokens,
+                labels[level],
+                offset,
+                sampling_kwargs,
+                level,
+                total_token_to_sample,
+                hop_length,
+                max_batch_size,
             )
 
             if save_results:
@@ -2532,9 +2556,7 @@ def _sample(
                 logdir = f"jukebox/level_{level}"
                 if not os.path.exists(logdir):
                     os.makedirs(logdir)
-                save_temp_audio(
-                    logdir, level, metas=metas, aud=raw_audio.float()
-                )
+                save_temp_audio(logdir, level, metas=metas, aud=raw_audio.float())
                 if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():
                         alignments = get_alignment(music_tokens, labels[-1], self.priors[0], self.config)

From 996a3bb99bfcf01111bfd17450a08fd9653ea149 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 13:31:23 +0000
Subject: [PATCH 178/196] update

---
 .../models/jukebox/configuration_jukebox.py   |  4 +--
 .../models/jukebox/modeling_jukebox.py        | 32 +++++++++----------
 tests/models/jukebox/test_modeling_jukebox.py |  4 +--
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 6d14e972492e7..0a9d34d97c9eb 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -405,8 +405,8 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
         res_dilation_cycle (`_type_`, *optional*):
-            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a
-            depth reduced by a power of `res_dilation_cycle`.
+            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
+            reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
         res_downs_t (`List[`int`]`, *optional*, defaults to `[3, 2, 2]`):
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2a5bf2cae73e8..66b278cce4163 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1294,9 +1294,9 @@ def __init__(
 
         Args:
             config (`JukeboxPriorConfig`):
-                Model configuration class with all the parameters of the model.
-                Initializing with a config file does not load the weights associated with the model, only the
-                configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+                Model configuration class with all the parameters of the model. Initializing with a config file does
+                not load the weights associated with the model, only the configuration. Check out the
+                [`~PreTrainedModel.from_pretrained`] method to load the model weights.
             n_ctx (`int`, *optional*, defaults to `None)`:
                 number of tokens or lyrics tokens provided in a single pass.
             embed_dim (`int`, *optional*, defaults to `None)`:
@@ -1356,7 +1356,6 @@ def forward(
     ):
         """
         Args:
-
         tokens : composed of both music tokens and lyrics tokens or just music tokens
         depending on the `merged_decoder` flag.
         """
@@ -1764,26 +1763,26 @@ def forward(self, metadata):
 
 class JukeboxPrior(PreTrainedModel):
     """
-    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be seen
-    as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù is defined,
-    it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist, genre, lyrics and codes
-    from lower-levels Priors.
+    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
+    seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù
+    is defined, it also models the `next character` prediction on the lyrics. Can be conditionned on timing, artist,
+    genre, lyrics and codes from lower-levels Priors.
 
     Args:
         config (`JukeboxPriorConfig`):
-            Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
         level (`int`):
             Current level of the Prior. Should be in range `0,nb_priors`.
         nb_priors (`int`, *optional*, defaults to `3)`:
             Total number of priors.
         vqvae_encoder (`Callable`, *optional*, defaults to `None)`:
-            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions
-            instead of the vqvae module to avoid getting the parameters.
+            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
+            the vqvae module to avoid getting the parameters.
         vqvae_decoder (`Callable`, *optional*, defaults to `None)`:
-            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions
-            instead of the vqvae module to avoid getting the parameters.
+            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
+            the vqvae module to avoid getting the parameters.
     """
 
     config_class = JukeboxPriorConfig
@@ -2455,7 +2454,8 @@ def _sample(
            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
-            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + lyric_sequence_lenght)` :
+            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 +
+            self.config.max_nb_genre + lyric_sequence_lenght)` :
                 List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                 condition the generation.
             sample_levels (`List[int]`):
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index e0d77e642a578..657acbfc1b653 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,7 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-1b-lyrics"
+    model_id = "jukebox-1b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -218,7 +218,7 @@ def test_vqvae(self):
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "openai/jukebox-5b-lyrics"
+    model_id = "jukebox-5b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",

From d45fa0e5e079a8fca11f23e28a8a709b7210b736 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 14:37:41 +0000
Subject: [PATCH 179/196] spell check dox

---
 .../models/jukebox/configuration_jukebox.py   |  8 +--
 .../models/jukebox/modeling_jukebox.py        | 61 +++++++++----------
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 0a9d34d97c9eb..a57e669f102b0 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -183,19 +183,19 @@ class JukeboxPriorConfig(PretrainedConfig):
         mask (`bool`, *optional*, defaults to `False`):
             Whether or not to mask the previous positions in the attention.
         max_duration (`int`, *optional*, defaults to 600):
-            #TODO FILLME
+            Maximum supported duration of the generated song in seconds.
         max_nb_genres (`int`, *optional*, defaults to 1):
-            #TODO FILLME
+            Maximum number of genres that can be used to condition the model.
         merged_decoder (`bool`, *optional*, defaults to `True`):
             Whether or not the decoder and the encoder inputs are merged. This is used for the separated
             encoder-decoder architecture
         metadata_conditioning (`bool`, *optional*, defaults to `True)`:
-            #TODO FILLME
+            Whether or not to condition on the artist and genre metadata.
         metadata_dims (`List[`int`]`, *optional*, defaults to `[604, 7898]`):
             Number of genres and the number of artists that were used to train the embedding layers of the prior
             models.
         min_duration (`int`, *optional*, defaults to 0):
-            #TODO FILLME
+            Minimum duration of the generated audio on which the model was trained.
         mlp_multiplier (`float`, *optional*, defaults to 1.0):
             Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
             the model will be used.
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 66b278cce4163..b950fc61b597e 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch Jukebox model."""
+"""Pytorch Jukebox model."""
 
 import math
 import os
@@ -586,8 +586,8 @@ def forward(self, input_audio):
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
 
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    This model is also a Pytorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular Pytorch Module and refer to the Pytorch documentation for all matter related to general usage
     and behavior.
 
     Parameters:
@@ -1290,25 +1290,25 @@ def __init__(
         is_encoder=False,
     ):
         """
-        _summary_
+        Autoregressive model.
 
         Args:
             config (`JukeboxPriorConfig`):
                 Model configuration class with all the parameters of the model. Initializing with a config file does
                 not load the weights associated with the model, only the configuration. Check out the
                 [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-            n_ctx (`int`, *optional*, defaults to `None)`:
+            n_ctx (`int`, *optional*):
                 number of tokens or lyrics tokens provided in a single pass.
-            embed_dim (`int`, *optional*, defaults to `None)`:
+            embed_dim (`int`, *optional*):
                 either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
                 if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
-            audio_conditioning (`bool`, *optional*, defaults to `False)`:
+            audio_conditioning (`bool`, *optional*, defaults to `False`):
                 whether or not the prior supports conditionning on audio.
-            metadata_conditioning (`bool`, *optional*, defaults to `False)`:
+            metadata_conditioning (`bool`, *optional*, defaults to `False`):
                 whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
-            encoder_len (`int`, *optional*, defaults to `0)`:
+            encoder_len (`int`, *optional*, defaults to `0`):
                 length of the encoder #TODO this is related to n_vocab
-            is_encoder (`bool`, *optional*, defaults to `False)`:
+            is_encoder (`bool`, *optional*, defaults to `False`):
                 _description_
         """
 
@@ -1640,7 +1640,7 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
         Args :
             music_tokens (`torch.LongTensor`):
                 Music tokens form the uper level in range(nb_discrete_codes)
-            raw_audio_conditionning (`torch.LongTensor`):
+            raw_audio_conditionning (`torch.LongTensor`, *optional*):
                 Audio used when primed sampling, raw audio information that conditions the generation
         """
         if raw_audio_conditionning is None:
@@ -1775,12 +1775,12 @@ class JukeboxPrior(PreTrainedModel):
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
         level (`int`):
             Current level of the Prior. Should be in range `0,nb_priors`.
-        nb_priors (`int`, *optional*, defaults to `3)`:
+        nb_priors (`int`, *optional*, defaults to 3):
             Total number of priors.
-        vqvae_encoder (`Callable`, *optional*, defaults to `None)`:
+        vqvae_encoder (`Callable`, *optional*):
             Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
             the vqvae module to avoid getting the parameters.
-        vqvae_decoder (`Callable`, *optional*, defaults to `None)`:
+        vqvae_decoder (`Callable`, *optional*):
             Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
             the vqvae module to avoid getting the parameters.
     """
@@ -1956,9 +1956,7 @@ def prior_preprocess(self, tokens, conds):
     def prior_postprocess(self, tokens):
         """
         Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
-        shared, prior_embed_dim_shift shifts the music token ids by
-         - nb_vocab.
-        Returns : only returns the music tokens
+        shared, prior_embed_dim_shift shifts the music token ids by nb_vocab. Only returns the music tokens.
         """
         batch_size = tokens.shape[0]
         # dim (nb_lyric_tokens, codebook dim = latent_dim of the model)
@@ -2276,7 +2274,7 @@ def __init__(self, *inputs, **kwargs):
 
 
 JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
-            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_lenght)` :
+            labels (`List[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` :
                 List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
                 condition the generation.
             sampling_kwargs (`Dict[Any]`):
@@ -2451,13 +2449,13 @@ def _sample(
         the generated raw audio at each step.
 
         Args:
-           music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
+           music_tokens (`List[torch.LongTensor] of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
-            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 +
-            self.config.max_nb_genre + lyric_sequence_lenght)` :
-                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
-                condition the generation.
+            labels (`List[torch.LongTensor]`):
+                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+                which are used to condition the generation.
             sample_levels (`List[int]`):
                 List of the desired levels at which the sampling will be done. A level is equivalent to the index of
                 the prior in the list of priors
@@ -2473,7 +2471,7 @@ def _sample(
             max_batch_size (`int`, *optional*, defaults to 16):
                 Maximum batch size for the top level priors
             sample_length_in_seconds (`int`, *optional*, defaults to 24):
-                Desired lenght of the generation in seconds
+                Desired length of the generation in seconds
             compute_alignments (`bool`, *optional*, defaults to `False`):
                 Whether or not to compute the alignment between the lyrics and the audio using the top_prior
             sample_tokens (`int`, *optional*):
@@ -2486,7 +2484,7 @@ def _sample(
                 Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
                 time.
             sample_length (`int`, *optional*):
-                Desired lenght of the generation in samples.
+                Desired length of the generation in samples.
 
         Returns: torch.Tensor
 
@@ -2571,9 +2569,10 @@ def _sample(
         the VQ-VAE decoder to convert the music tokens to raw audio.
 
         Args:
-            labels (`List[Torch.LongTensor]` of lenght `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre + lyric_sequence_lenght)` :
-                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
-                condition the generation.
+            labels (`List[torch.LongTensor]`) :
+                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+                which are used to condition the generation.
             n_samples (`int`, *optional*, default to 1) :
                 Number of samples to be generated in parallel.
         """,
@@ -2612,7 +2611,7 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
         """Generates a continuation of the previously generated tokens.
 
         Args:
-            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
+            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
         """,
@@ -2627,7 +2626,7 @@ def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> List[torch
         """Upsamples a sequence of music tokens using the prior at level `level`.
 
         Args:
-            music_tokens (`List[torch.LongTensor`] of length `self.levels` ) :
+            music_tokens (`List[torch.LongTensor]` of length `self.levels` ) :
                 A sequence of music tokens which will be used as context to continue the sampling process. Should have
                 `self.levels` tensors, each corresponding to the generation at a certain level.
         """,
@@ -2644,7 +2643,7 @@ def upsample(self, music_tokens, labels, **sampling_kwargs) -> List[torch.LongTe
         used: as conditioning for each level, which means that no ancestral sampling is required.
 
         Args:
-            raw_audio (`List[torch.Tensor`] of length `n_samples` ) :
+            raw_audio (`List[torch.Tensor]` of length `n_samples` ) :
                 A list of raw audio that will be used as conditioning information for each samples that will be
                 generated.
         """,

From ce653b368936c6daf308bc9a165d3297f11a4265 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 16:03:41 +0000
Subject: [PATCH 180/196] fix doc and base model prefixes

---
 .../models/jukebox/configuration_jukebox.py   | 20 ++--
 .../models/jukebox/modeling_jukebox.py        | 96 +++++++++++--------
 2 files changed, 68 insertions(+), 48 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index a57e669f102b0..5db5faffd8623 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -238,7 +238,7 @@ class JukeboxPriorConfig(PretrainedConfig):
             Whether or not to zero out convolution weights when initializing.
     """
 
-    model_type = "jukebox"
+    model_type = "jukebox_prior"
     attribute_map = {
         "hidden_size": "vqvae_codebook_dimension",
         "max_position_embeddings": "n_positions",
@@ -248,6 +248,7 @@ class JukeboxPriorConfig(PretrainedConfig):
     def __init__(
         self,
         act_fn="quick_gelu",
+        level=0,
         alignment_head=2,
         alignment_layer=68,
         attention_multiplier=0.25,
@@ -311,6 +312,7 @@ def __init__(
         self.init_scale = init_scale
         self.is_encoder_decoder = is_encoder_decoder
         self.lyric_vocab_size = lyric_vocab_size
+        self.level = level
         self.mask = mask
         self.max_duration = max_duration
         self.max_nb_genres = max_nb_genres
@@ -337,13 +339,14 @@ def __init__(
         self.zero_out = zero_out
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs
+    ) -> "PretrainedConfig":
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # get the prior config dict if we are loading from JukeboxConfig
-        if config_dict.get("model_type") == "jukebox_prior":
-            config_dict = config_dict["prior_configs"]
+        if config_dict.get("model_type") == "jukebox":
+            config_dict = config_dict[f"prior_{level}"]
 
         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
             logger.warning(
@@ -417,6 +420,8 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
     """
 
+    model_type = "jukebox_vqvae"
+
     def __init__(
         self,
         act_fn="relu",
@@ -537,11 +542,6 @@ class JukeboxConfig(PretrainedConfig):
     """
 
     model_type = "jukebox"
-    attribute_map = {
-        "hidden_size": "codebook_dimension",
-        "max_position_embeddings": "n_positions",
-        "num_attention_heads": "n_head",
-    }
     is_composition = True
 
     def __init__(
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index b950fc61b597e..9d98a205e5329 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -605,7 +605,8 @@ def forward(self, input_audio):
     JUKEBOX_START_DOCSTRING,
 )
 class JukeboxVQVAE(PreTrainedModel):
-    config_class = JukeboxConfig
+    config_class = JukeboxVQVAEConfig
+    base_model_prefix = "vqvae"
 
     def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__(config)
@@ -701,11 +702,11 @@ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
             input_audio (`torch.Tensor`):
                 Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
                 form the codebook will be computed for each sequence of samples.
-            start_level (`int`, *optional*):
+            start_level (`int`, *optional*, defaults to 0):
                 Level at which the encoding process will start. Default to 0.
             end_level (`int`, *optional*):
                 Level at which the encoding process will start. Default to None.
-            bs_chunks (int, *optional*):
+            bs_chunks (int, *optional*, defaults to 1):
                 Number of chunks of raw audio to process at the same time.
         """
         audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
@@ -742,12 +743,13 @@ def forward(self, raw_audio):
         >>> from transformers import JukeboxVQVAE, set_seed
         >>> import torch
 
-        >>> model = JukeboxVQVAE.from_pretrained("ArthurZ/vqvae-dummy").eval()
+        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
         >>> set_seed(0)
         >>> zs = [torch.randint(100, (4, 1))]
         >>> model.decode(zs).shape
         torch.Size([4, 8, 1])
-        ```"""
+        ```
+        """
 
         # Encode/Decode
         input_audio = raw_audio.permute(0, 2, 1).float()
@@ -804,7 +806,7 @@ def forward(self, input):
 
 
 class JukeboxAttention(nn.Module):
-    def __init__(self, config, n_ctx, attn_func="dense_attn", encoder_len=None):
+    def __init__(self, config, n_ctx, attn_func="dense_attn"):
         super().__init__()
         self.embed_dim = config.hidden_size
         self.n_heads = config.n_heads
@@ -855,7 +857,7 @@ def __init__(self, config, n_ctx, attn_func="dense_attn", encoder_len=None):
 
         self.sample_t = 0
         self.cache = {}
-        self.encoder_len = encoder_len
+        self.encoder_len = config.nb_relevant_lyric_tokens  # length of the encoder input ids
         self.record_attn = False
 
     def _attn(self, query_states, key_states, value_states, sample):
@@ -1178,15 +1180,10 @@ def del_cache(self):
 
 
 class JukeboxBlock(nn.Module):
-    def __init__(self, config, n_ctx, attn_func="dense_attn", encoder_len=None):
+    def __init__(self, config, n_ctx, attn_func="dense_attn"):
         super().__init__()
         self.width = config.hidden_size
-        self.attn = JukeboxAttention(
-            config=config,
-            n_ctx=n_ctx,
-            attn_func=attn_func,
-            encoder_len=encoder_len,
-        )
+        self.attn = JukeboxAttention(config, n_ctx, attn_func=attn_func)
 
         self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size)
         self.mlp = JukeboxMLP(config)
@@ -1209,7 +1206,7 @@ def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
 
 
 class JukeboxLayerStack(nn.Module):
-    def __init__(self, config, n_ctx, encoder_len=None):
+    def __init__(self, config, n_ctx):
         super().__init__()
         self.n_ctx = n_ctx
         self.width = config.hidden_size
@@ -1218,16 +1215,14 @@ def __init__(self, config, n_ctx, encoder_len=None):
         self.attention_pattern = config.attention_pattern
         if self.blocks is not None:
             self.block_ctx = n_ctx // self.blocks
-        self.encoder_len = encoder_len
+        self.encoder_len = config.nb_relevant_lyric_tokens
         self.n_heads = config.n_heads
 
         # Orders of attn_func
         attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
         self._attn_mods = nn.ModuleList()
         for depth in range(self.depth):
-            self._attn_mods.append(
-                JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth), encoder_len=encoder_len)
-            )
+            self._attn_mods.append(JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth)))
 
         self.saved_attn_weights = []
 
@@ -1286,7 +1281,6 @@ def __init__(
         embed_dim=None,
         audio_conditioning=False,
         metadata_conditioning=False,
-        encoder_len=0,
         is_encoder=False,
     ):
         """
@@ -1306,8 +1300,6 @@ def __init__(
                 whether or not the prior supports conditionning on audio.
             metadata_conditioning (`bool`, *optional*, defaults to `False`):
                 whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
-            encoder_len (`int`, *optional*, defaults to `0`):
-                length of the encoder #TODO this is related to n_vocab
             is_encoder (`bool`, *optional*, defaults to `False`):
                 _description_
         """
@@ -1326,9 +1318,9 @@ def __init__(
         self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.hidden_size)
         self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
 
-        self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx, encoder_len=encoder_len)
+        self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx)
         self.is_encoder = is_encoder
-        self.encoder_len = encoder_len
+        self.encoder_len = config.nb_relevant_lyric_tokens
 
         if config.merged_decoder:
             # Merged piped model uses this setup
@@ -1773,7 +1765,7 @@ class JukeboxPrior(PreTrainedModel):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-        level (`int`):
+        level (`int`, *optional*):
             Current level of the Prior. Should be in range `0,nb_priors`.
         nb_priors (`int`, *optional*, defaults to 3):
             Total number of priors.
@@ -1787,7 +1779,36 @@ class JukeboxPrior(PreTrainedModel):
 
     config_class = JukeboxPriorConfig
 
-    def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
+    def _init_weights(self, module):
+        init_scale = self.config.init_scale
+
+        if isinstance(module, nn.Embedding):  # embed_tokens
+            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxConv1D):
+            if self.config.zero_out:
+                module.weight.data.zero_()
+            else:
+                module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxPositionalEmbedding):
+            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxRangeEmbedding):
+            module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
+            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, nn.Parameter) and "pos_emb" in module.__class__.__name__:
+            module.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, nn.Parameter) and "start_token" in module.__class__.__name__:
+            module.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+            module.conv1d_2.weigth.data.zero_()
+            module.conv1d_2.bias.data.zero_()
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
 
         super().__init__(config)
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
@@ -1796,7 +1817,8 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
         self.vqvae_decoder = vqvae_decoder
 
         self.levels = nb_priors
-        self.level = level
+        self.level = level if level is not None else config.level
+        self.base_model_prefix = f"priors.{self.level}"
         self.n_ctx = config.n_ctx
 
         self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0
@@ -1804,8 +1826,8 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
         self.encoder_loss_fraction = config.encoder_loss_fraction
 
         # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
-        self.audio_conditioning = level != 0
-        self.cond_level = level - 1
+        self.audio_conditioning = self.level != 0
+        self.cond_level = self.level - 1
         if self.audio_conditioning:
             self.conditioner_blocks = JukeboxMusicTokenConditioner(config, self.level)
 
@@ -1823,7 +1845,7 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
             self.embed_dim_shift = [0, config.lyric_vocab_size]
             self.width = config.hidden_size
 
-            self.lyrics_enc_loss_dims = config.nb_relevant_lyric_tokens
+            self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
 
             self.prior = JukeboxConditionalAutoregressive(
                 config,
@@ -1831,7 +1853,6 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
                 embed_dim=config.lyric_vocab_size + config.music_vocab_size,
                 audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
                 metadata_conditioning=True,
-                encoder_len=self.lyrics_enc_loss_dims,
             )
 
         else:
@@ -1840,7 +1861,6 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
             encoder_config = config.encoder_config
 
             if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
-                self.lyrics_enc_loss_dims = self.nb_relevant_lyric_tokens
                 self.lyric_acts_width = encoder_config.hidden_size
                 self.encoder_width = config.hidden_size
                 self.encoder_dim = config.lyric_vocab_size
@@ -1856,7 +1876,7 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
                 self.encoder.final_layer_norm = JukeboxLayerNorm(config.hidden_size)
                 self.encoder.lm_head = nn.Linear(config.hidden_size, config.lyric_vocab_size, bias=False)
             else:
-                self.lyrics_enc_loss_dims = 0
+                self.nb_relevant_lyric_tokens = 0
 
             # decoder model on the tokens
             self.prior = JukeboxConditionalAutoregressive(
@@ -1866,15 +1886,15 @@ def __init__(self, config: JukeboxPriorConfig, level, nb_priors=3, vqvae_encoder
             )
 
         self.next_token_prediction_loss_dims = config.n_ctx
-        self.total_loss_dims = self.lyrics_enc_loss_dims + self.next_token_prediction_loss_dims
+        self.total_loss_dims = self.nb_relevant_lyric_tokens + self.next_token_prediction_loss_dims
 
         self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
-        self.cond_downsample = self.downsamples[level] if level != 0 else None
-        self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - level])
+        self.cond_downsample = self.downsamples[self.level] if self.level != 0 else None
+        self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - self.level])
         self.sample_length = self.n_ctx * self.raw_to_tokens
 
         logger.info(
-            f"Level:{level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
+            f"Level:{self.level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
             f" length:{self.sample_length}"
         )
 
@@ -2183,7 +2203,7 @@ def forward_tokens(
                 last_encoder_hidden_states,
                 get_preds=get_preds,
             )
-        loss = self.encoder_loss_fraction * encoder_loss * self.lyrics_enc_loss_dims / self.total_loss_dims
+        loss = self.encoder_loss_fraction * encoder_loss * self.nb_relevant_lyric_tokens / self.total_loss_dims
         loss += next_token_prediction_loss * self.next_token_prediction_loss_dims / self.total_loss_dims
 
         metrics = dict(

From 5b5772cce0f6d04cbe21787763fc74ab1d447bda Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 16:10:57 +0000
Subject: [PATCH 181/196] rename depth to num_layers

---
 src/transformers/models/jukebox/configuration_jukebox.py | 8 ++++----
 src/transformers/models/jukebox/modeling_jukebox.py      | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 5db5faffd8623..5c599eb22f60d 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -165,8 +165,8 @@ class JukeboxPriorConfig(PretrainedConfig):
         conv_res_scale (`int`, *optional*):
             Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
             conditioner, the default value is to None and should not be modified.
-        depth (`int`, *optional*, defaults to 72):
-            Number of layers of the decoder architecture. #TODO replace with num decoder_layers?
+        num_layers (`int`, *optional*, defaults to 72):
+            Number of layers of the transformer architecture.
         emb_dropout (`int`, *optional*, defaults to 0):
             Embedding dropout used in the lyric decoder.
         encoder_config (`JukeboxPriorConfig`, *optional*) :
@@ -257,7 +257,7 @@ def __init__(
         attn_res_scale=False,
         blocks=64,
         conv_res_scale=None,
-        depth=72,
+        num_decoder_layers=72,
         emb_dropout=0,
         encoder_config=None,
         encoder_loss_fraction=0.4,
@@ -301,7 +301,7 @@ def __init__(
         self.attn_res_scale = attn_res_scale
         self.blocks = blocks
         self.conv_res_scale = conv_res_scale
-        self.depth = depth
+        self.num_decoder_layers = num_decoder_layers
         self.emb_dropout = emb_dropout
         self.music_vocab_size = music_vocab_size
         if encoder_config is not None:
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 9d98a205e5329..2b4f6a3601399 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -1188,7 +1188,7 @@ def __init__(self, config, n_ctx, attn_func="dense_attn"):
         self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size)
         self.mlp = JukeboxMLP(config)
         self.layer_norm_1 = JukeboxLayerNorm(config.hidden_size)
-        self.res_scale = 1.0 / config.depth if config.attn_res_scale else 1.0
+        self.res_scale = 1.0 / config.num_layers if config.attn_res_scale else 1.0
         self.attn_func = attn_func
 
     def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
@@ -1210,7 +1210,7 @@ def __init__(self, config, n_ctx):
         super().__init__()
         self.n_ctx = n_ctx
         self.width = config.hidden_size
-        self.depth = config.depth
+        self.num_layers = config.num_layers
         self.blocks = config.blocks
         self.attention_pattern = config.attention_pattern
         if self.blocks is not None:
@@ -1221,7 +1221,7 @@ def __init__(self, config, n_ctx):
         # Orders of attn_func
         attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
         self._attn_mods = nn.ModuleList()
-        for depth in range(self.depth):
+        for depth in range(self.num_layers):
             self._attn_mods.append(JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth)))
 
         self.saved_attn_weights = []
@@ -1306,7 +1306,7 @@ def __init__(
 
         super().__init__()
         self.width = config.hidden_size
-        self.depth = config.depth
+        self.num_layers = config.num_layers
         self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
         self.embed_dim = embed_dim if embed_dim is not None else config.music_vocab_size
         self.embed_tokens = nn.Embedding(self.embed_dim, config.hidden_size)

From 0ea6ba4fa048c43e300c5b64663f090331f3a1a4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 16:13:30 +0000
Subject: [PATCH 182/196] update doc

---
 src/transformers/models/jukebox/configuration_jukebox.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 5c599eb22f60d..d40e4057fe128 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -399,7 +399,7 @@ class JukeboxVQVAEConfig(PretrainedConfig):
         lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
-        multipliers (`tuple`, *optional*, defaults to `(2, 1, 1)`):
+        multipliers (`List[`int`]`, *optional*, defaults to `[2, 1, 1]`):
             Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
         res_conv_depth (`int`, *optional*, defaults to 4):
             Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
@@ -433,14 +433,14 @@ def __init__(
         hop_fraction=[0.125, 0.5, 0.5],
         levels=3,
         lmu=0.99,
-        multipliers=(2, 1, 1),
+        multipliers=[2, 1, 1],
         res_conv_depth=4,
         res_conv_width=32,
         res_convolution_multiplier=1,
         res_dilation_cycle=None,
         res_dilation_growth_rate=3,
-        res_downs_t=(3, 2, 2),
-        res_strides_t=(2, 2, 2),
+        res_downs_t=[3, 2, 2],
+        res_strides_t=[2, 2, 2],
         sample_length=1058304,
         **kwargs
     ):

From 479047bb2cf3a12ff2916679f23df87d632de9c1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 16:23:02 +0000
Subject: [PATCH 183/196] Update init and add JukeboxPrior and JukeboxVQVAE
 configs to doc

---
 docs/source/en/model_doc/jukebox.mdx        | 24 +++++++++++++++------
 src/transformers/__init__.py                | 16 ++++++++++++--
 src/transformers/models/jukebox/__init__.py |  7 +++++-
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 8f145571e7d90..2351e8e84b70f 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -41,25 +41,35 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 [[autodoc]] JukeboxConfig
 
+## JukeboxModel
+
+[[autodoc]] JukeboxModel
+    - ancestral_sample
+    - primed_sample
+    - continue_sample
+    - upsample
+    - _sample
+
 ## JukeboxTokenizer
 
 [[autodoc]] JukeboxTokenizer
     - save_vocabulary
 
+## JukeboxPriorConfig
+
+[[autodoc]] JukeboxPriorConfig
+
 ## JukeboxPrior
 
 [[autodoc]] JukeboxPrior
     - sample
     - forward
 
-## JukeboxModel
 
-[[autodoc]] JukeboxModel
-    - ancestral_sample
-    - primed_sample
-    - continue_sample
-    - upsample
-    - _sample
+## JukeboxVQVAEConfig
+
+[[autodoc]] JukeboxVQVAEConfig
+
 
 ## JukeboxVQVAE
 
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 68dd6ea22b611..6dac658d4faa6 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -248,7 +248,13 @@
     "models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
     "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
     "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"],
-    "models.jukebox": ["JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "JukeboxConfig", "JukeboxTokenizer"],
+    "models.jukebox": [
+        "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "JukeboxConfig",
+        "JukeboxPriorConfig",
+        "JukeboxTokenizer",
+        "JukeboxVQVAEConfig",
+    ],
     "models.layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMTokenizer"],
     "models.layoutlmv2": [
         "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -3353,7 +3359,13 @@
     from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
     from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
     from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig
-    from .models.jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig, JukeboxTokenizer
+    from .models.jukebox import (
+        JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        JukeboxConfig,
+        JukeboxPriorConfig,
+        JukeboxTokenizer,
+        JukeboxVQVAEConfig,
+    )
     from .models.layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMTokenizer
     from .models.layoutlmv2 import (
         LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 8272d8204450f..f17624c1418ed 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -22,7 +22,12 @@
 
 
 _import_structure = {
-    "configuration_jukebox": ["JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "JukeboxConfig"],
+    "configuration_jukebox": [
+        "JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "JukeboxConfig",
+        "JukeboxPriorConfig",
+        "JukeboxVQVAEConfig",
+    ],
     "tokenization_jukebox": ["JukeboxTokenizer"],
 }
 

From 18300a7bd1c627f0f52a709fc4e926dcf9f48241 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 16:33:11 +0000
Subject: [PATCH 184/196] fix typechecking

---
 src/transformers/models/jukebox/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index f17624c1418ed..521bb40fb6dab 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -46,7 +46,7 @@
     ]
 
 if TYPE_CHECKING:
-    from .configuration_jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig
+    from .configuration_jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
     from .tokenization_jukebox import JukeboxTokenizer
 
     try:

From 84c2ee0b6005554e899df72e1d7cd9fe86ef61d5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 18:25:44 +0000
Subject: [PATCH 185/196] fixup

---
 src/transformers/models/jukebox/__init__.py   |  7 +++-
 .../models/jukebox/configuration_jukebox.py   | 28 +++++++--------
 .../models/jukebox/modeling_jukebox.py        | 24 +++++--------
 .../models/jukebox/tokenization_jukebox.py    | 36 +++++++++----------
 4 files changed, 45 insertions(+), 50 deletions(-)

diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/jukebox/__init__.py
index 521bb40fb6dab..774e06bc3409b 100644
--- a/src/transformers/models/jukebox/__init__.py
+++ b/src/transformers/models/jukebox/__init__.py
@@ -46,7 +46,12 @@
     ]
 
 if TYPE_CHECKING:
-    from .configuration_jukebox import JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
+    from .configuration_jukebox import (
+        JUKEBOX_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        JukeboxConfig,
+        JukeboxPriorConfig,
+        JukeboxVQVAEConfig,
+    )
     from .tokenization_jukebox import JukeboxTokenizer
 
     try:
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index d40e4057fe128..608ebbfd04f7e 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -191,7 +191,7 @@ class JukeboxPriorConfig(PretrainedConfig):
             encoder-decoder architecture
         metadata_conditioning (`bool`, *optional*, defaults to `True)`:
             Whether or not to condition on the artist and genre metadata.
-        metadata_dims (`List[`int`]`, *optional*, defaults to `[604, 7898]`):
+        metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
             Number of genres and the number of artists that were used to train the embedding layers of the prior
             models.
         min_duration (`int`, *optional*, defaults to 0):
@@ -222,18 +222,18 @@ class JukeboxPriorConfig(PretrainedConfig):
             tokens.
         res_dilation_growth_rate (`int`, *optional*, defaults to 1):
             Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
-        res_downs_t (`List[`int`]`, *optional*, defaults to `[3, 2, 2]`):
+        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
             Downsampling rates used in the audio conditioning network
-        res_strides_t (`List[`int`]`, *optional*, defaults to `[2, 2, 2]`):
+        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
             Striding used in the audio conditioning network
         resid_dropout (`int`, *optional*, defaults to 0):
             Residual dropout used in the attention pattern.
         sampling_rate (`int`, *optional*, defaults to 44100):
-            _description_
+            Sampling rate used for training.
         spread (`int`, *optional*):
             Spread used in the `summary_spread_attention` pattern
         timing_dims (`int`, *optional*, defaults to 64):
-            _description_
+            Dimension of the timing embedding.
         zero_out (`bool`, *optional*, defaults to `False`):
             Whether or not to zero out convolution weights when initializing.
     """
@@ -381,7 +381,7 @@ class JukeboxVQVAEConfig(PretrainedConfig):
 
     Args:
         act_fn (`str`, *optional*, defaults to `"relu"`):
-            _description_
+            Activation function of the model.
         nb_discrete_codes (`int`, *optional*, defaults to 2048):
             Number of codes of the VQVAE.
         commit (`float`, *optional*, defaults to 0.02):
@@ -392,14 +392,14 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
         embed_dim (`int`, *optional*, defaults to 64):
             Embedding dimension of the codebook vectors.
-        hop_fraction (`List[`int`]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
+        hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
             Fraction of non-intersecting window used when continuing the sampling process.
         levels (`int`, *optional*, defaults to 3):
             Number of hierarchical levels that used in the VQVAE.
         lmu (`float`, *optional*, defaults to 0.99):
             Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
             of the original [VQVAE paper](https://arxiv.org/pdf/1711.00937v2.pdf)
-        multipliers (`List[`int`]`, *optional*, defaults to `[2, 1, 1]`):
+        multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
             Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
         res_conv_depth (`int`, *optional*, defaults to 4):
             Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
@@ -407,14 +407,14 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
         res_convolution_multiplier (`int`, *optional*, defaults to 1):
             Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
-        res_dilation_cycle (`_type_`, *optional*):
+        res_dilation_cycle (`int`, *optional*):
             Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
             reduced by a power of `res_dilation_cycle`.
         res_dilation_growth_rate (`int`, *optional*, defaults to 3):
             Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
-        res_downs_t (`List[`int`]`, *optional*, defaults to `[3, 2, 2]`):
+        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
             Downsampling rate for each level of the hierarchical VQ-VAE.
-        res_strides_t (`List[`int`]`, *optional*, defaults to `[2, 2, 2]`):
+        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
             Stride used for each level of the hierarchical VQ-VAE.
         sample_length (`int`, *optional*, defaults to 1058304):
             Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
@@ -500,9 +500,9 @@ class JukeboxConfig(PretrainedConfig):
 
     Args:
         vqvae_config (`JukeboxVQVAEConfig`, *optional*):
-            _description_
-        prior_config_list (`List[`JukeboxPriorConfig`]`, *optional*):
-            _description_
+            Configuration for the `JukeboxVQVAE` model.
+        prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
+            List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
         nb_priors (`int`, *optional*, defaults to 3):
             Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
             (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 2b4f6a3601399..e2b8d81e1b1d3 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -440,11 +440,8 @@ def update_codebook(self, hidden_states, latent_states):
         return dict(entropy=entropy, used_curr=used_curr, usage=usage, dk=dk)
 
     def preprocess(self, hidden_states):
-        # NCT -> NTC -> [NT, C]
         hidden_states = hidden_states.permute(0, 2, 1).contiguous()
-        hidden_states = hidden_states.view(
-            -1, hidden_states.shape[-1]
-        )  # x_en = (batch_size *L, w), k_j = (w, nb_discrete_codes)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
 
         if hidden_states.shape[-1] == self.codebook_width:
             prenorm = torch.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(np.prod(hidden_states.shape))
@@ -460,7 +457,6 @@ def preprocess(self, hidden_states):
         return hidden_states, prenorm
 
     def postprocess(self, latent_states, dequantised_states, x_shape):
-        # [NT, C] -> NTC -> NCT
         batch_size, time = x_shape
         dequantised_states = dequantised_states.view(batch_size, time, -1).permute(0, 2, 1).contiguous()
         latent_states = latent_states.view(batch_size, time)
@@ -1479,7 +1475,6 @@ def sample(
             tokens = torch.cat(sampled_tokens, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            # tokens = self.postprocess(tokens, sample_tokens)
         if get_preds:
             return tokens, preds
         else:
@@ -1597,7 +1592,6 @@ def primed_sample(
             music_tokens = torch.cat(sampled_audio, dim=1)
             if get_preds:
                 preds = torch.cat(preds, dim=1)
-            # music_tokens = self.postprocess(music_tokens, sample_tokens)
         if get_preds:
             return music_tokens, preds
         else:
@@ -1691,8 +1685,8 @@ def forward(self, pos_start, pos_end=None):
             position = pos_start
 
         # Bin each value to bins_
-        normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)  # [0,1)
         # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1
+        normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)
         bins_ = (self.embed_dim * normalised_position).floor().long().detach()
         return self.emb(bins_)
 
@@ -2065,12 +2059,12 @@ def sample(
         Args:
             n_samples (`int`):
                 Number of samples to generate.
-            music_tokens (`List[`torch.LongTensor`]`, *optional*):
+            music_tokens (`List[torch.LongTensor]`, *optional*):
                 Previously gemerated tokens at the current level. Used as context for the generation.
-            music_tokens_conds (`List[`torch.FloatTensor`]`, *optional*):
+            music_tokens_conds (`List[torch.FloatTensor]`, *optional*):
                 Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
                 conditionned on the upper-level tokens.
-            metadata (`List[`torch.LongTensor`]`, *optional*):
+            metadata (`List[torch.LongTensor]`, *optional*):
                 List containing the metatdata tensor with the artist, genre and the lyric tokens.
             temp (`float`, *optional*, defaults to 1.0):
                 Sampling temperature.
@@ -2228,7 +2222,7 @@ def forward(self, hidden_states, metadata=None, decode=False, get_preds=False):
         Args:
             hidden_states (`torch.Tensor`):
                 Hidden states which should be raw audio
-            metadata (`List[`torch.LongTensor`]`, *optional*):
+            metadata (`List[torch.LongTensor]`, *optional*):
                 List containing the metadata conditioning tensorwith the lyric and the metadata tokens.
             decode (`bool`, *optional*, defaults to `False`):
                 Whether or not to decode the encoded to tokens.
@@ -2305,9 +2299,9 @@ def __init__(self, *inputs, **kwargs):
 
 @add_start_docstrings(
     """The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
-`continue_sample` and `ancestral_sample`.
-    It does not have a `forward` method as the training is not end to end. If you want to fine-tune the model, it is
-    recommended to use the `JukeboxPrior` class and train each prior individually.
+    `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If
+    you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior
+    individually.
     """,
     JUKEBOX_START_DOCSTRING,
 )
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 15b03365f9eac..1a996b77f0d0d 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -17,13 +17,14 @@
 
 import json
 import os
+import re
 import unicodedata
 from json.encoder import INFINITY
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
-import regex as re
+import regex
 from transformers.utils.generic import _is_jax, _is_numpy
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
@@ -153,7 +154,7 @@ def __init__(
         if len(self.lyrics_encoder) == 79:
             oov = oov.replace("\-'", "\-+'")
 
-        self.out_of_vocab = re.compile(oov)
+        self.out_of_vocab = regex.compile(oov)
         self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
         self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
         self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
@@ -183,7 +184,7 @@ def _tokenize(self, lyrics):
         Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
         vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
 
-        Do NOT take care of added tokens. Only the lytrics are split into character for the character-based vocabulary.
+        Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
         """
         # only lyrics are not tokenized, but character based is easily handled
         return [character for character in lyrics]
@@ -209,7 +210,7 @@ def prepare_for_tokenization(
             artist (`str`):
                 The artist name to prepare. This will mostly lower the string
             genres (`str`):
-                The gnere name to prepare. This will mostly lower the string.
+                The genre name to prepare. This will mostly lower the string.
             lyrics (`str`):
                 The lyrics to prepare.
             is_split_into_words (`bool`, *optional*, defaults to `False`):
@@ -227,10 +228,10 @@ def prepare_for_tokenization(
                 artists[idx] = self._normalize(artists[idx]) + ".v2"
                 genres[idx] = [
                     self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
-                ]  # split is for the full dictionnary with combined genres
+                ]  # split is for the full dictionary with combined genres
 
         if self.version[0] == "v2":
-            self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
+            self.out_of_vocab = regex.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
             vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
             self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
             self.vocab["<unk>"] = 0
@@ -239,7 +240,7 @@ def prepare_for_tokenization(
             self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
             self.lyrics_decoder[0] = ""
         else:
-            self.out_of_vocab = re.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
+            self.out_of_vocab = regex.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
 
         lyrics = self._run_strip_accents(lyrics)
         lyrics = lyrics.replace("\\", "\n")
@@ -258,13 +259,13 @@ def _run_strip_accents(self, text):
         return "".join(output)
 
     def _normalize(self, text: str) -> str:
-        """Normalizes the input text. This process is for the genres and the artit
+        """
+        Normalizes the input text. This process is for the genres and the artist
 
         Args:
             text (`str`):
                 Artist or Genre string to normalize
         """
-        import re
 
         accepted = (
             [chr(i) for i in range(ord("a"), ord("z") + 1)]
@@ -273,9 +274,9 @@ def _normalize(self, text: str) -> str:
             + ["."]
         )
         accepted = frozenset(accepted)
-        rex = re.compile(r"_+")
+        pattern = re.compile(r"_+")
         text = "".join([c if c in accepted else "_" for c in text.lower()])
-        text = rex.sub("_", text).strip("_")
+        text = pattern.sub("_", text).strip("_")
         return text
 
     def convert_lyric_tokens_to_string(self, lyrics: List[str]) -> str:
@@ -367,16 +368,11 @@ def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncod
             )
             for i in range(len(self.version))
         ]
-        return BatchEncoding(
-            {
-                "input_ids": input_ids,
-                "attention_masks": attention_masks,
-            }
-        )
+        return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks})
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
-        Saves the tokenizer's vocabulary dictionnary to the provided save_directory.
+        Saves the tokenizer's vocabulary dictionary to the provided save_directory.
 
         Args:
             save_directory (`str`):
@@ -414,9 +410,9 @@ def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
 
         Args:
             artists_index (`int`):
-                Index of the artist in its corresponding dictionnary.
+                Index of the artist in its corresponding dictionary.
             genres_index (`Union[List[int], int]`):
-               Index of the genre in its corresponding dictionnary.
+               Index of the genre in its corresponding dictionary.
             lyric_index (`List[int]`):
                 List of character indices, which each correspond to a character.
         """

From 4c981282a68e6f5fe92324dda90a5bb8be4ed281 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 18:52:04 +0000
Subject: [PATCH 186/196] update to mask missing keys from other priors

---
 .../models/jukebox/configuration_jukebox.py            |  4 ++--
 src/transformers/models/jukebox/modeling_jukebox.py    |  6 +++++-
 .../models/jukebox/tokenization_jukebox.py             | 10 ++++++----
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 608ebbfd04f7e..97c89457a825c 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -257,7 +257,7 @@ def __init__(
         attn_res_scale=False,
         blocks=64,
         conv_res_scale=None,
-        num_decoder_layers=72,
+        num_layers=72,
         emb_dropout=0,
         encoder_config=None,
         encoder_loss_fraction=0.4,
@@ -301,7 +301,7 @@ def __init__(
         self.attn_res_scale = attn_res_scale
         self.blocks = blocks
         self.conv_res_scale = conv_res_scale
-        self.num_decoder_layers = num_decoder_layers
+        self.num_layers = num_layers
         self.emb_dropout = emb_dropout
         self.music_vocab_size = music_vocab_size
         if encoder_config is not None:
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e2b8d81e1b1d3..18fb7276f90b2 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -603,6 +603,7 @@ def forward(self, input_audio):
 class JukeboxVQVAE(PreTrainedModel):
     config_class = JukeboxVQVAEConfig
     base_model_prefix = "vqvae"
+    _keys_to_ignore_on_load_unexpected = [r"priors"]
 
     def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__(config)
@@ -1772,6 +1773,7 @@ class JukeboxPrior(PreTrainedModel):
     """
 
     config_class = JukeboxPriorConfig
+    _keys_to_ignore_on_load_unexpected = ["vqvae"]
 
     def _init_weights(self, module):
         init_scale = self.config.init_scale
@@ -1803,7 +1805,6 @@ def _init_weights(self, module):
             module.bias.data.zero_()
 
     def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
-
         super().__init__(config)
         # Passing functions instead of the vqvae module to avoid getting params, only used in the
         # forward loop
@@ -1812,7 +1813,10 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
 
         self.levels = nb_priors
         self.level = level if level is not None else config.level
+
         self.base_model_prefix = f"priors.{self.level}"
+        self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level]
+
         self.n_ctx = config.n_ctx
 
         self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 1a996b77f0d0d..13b17ddb969f5 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -351,7 +351,7 @@ def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncod
                 Name of the artist.
             genres (`str`):
                 List of genres that will be mixed to condition the audio
-            lyrics (`str`, *optional*):
+            lyrics (`str`, *optional*, defaults to `""`):
                 Lyrics used to condition the generation
         """
         input_ids = [0, 0, 0]
@@ -376,9 +376,11 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
 
         Args:
             save_directory (`str`):
-                _description_
-            filename_prefix (`Optional[str]`, *optional*, defaults to None):
-                _description_
+                A path to the directory where to saved. It will be created if it doesn't exist.
+
+            filename_prefix (`Optional[str]`, *optional*):
+                A prefix to add to the names of the files saved by the tokenizer.
+
         """
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")

From 7ee9b0d2a8af3e61aff81117193df14c683fc9d6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 9 Nov 2022 21:16:42 +0000
Subject: [PATCH 187/196] update initialization

---
 .../models/jukebox/configuration_jukebox.py   | 12 +++-
 .../models/jukebox/modeling_jukebox.py        | 56 ++++++++-----------
 2 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 97c89457a825c..ec60edbb6ca52 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -291,7 +291,7 @@ def __init__(
         zero_out=False,
         **kwargs
     ):
-
+        super().__init__()
         self.act_fn = act_fn
         self.alignment_head = alignment_head
         self.alignment_layer = alignment_layer
@@ -418,6 +418,10 @@ class JukeboxVQVAEConfig(PretrainedConfig):
             Stride used for each level of the hierarchical VQ-VAE.
         sample_length (`int`, *optional*, defaults to 1058304):
             Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
+        init_scale (`float`, *optional*, defaults to 0.2):
+            Initialization scale.
+        zero_out (`bool`, *optional*, defaults to `False`):
+            Whether or not to zero out convolution weights when initializing.
     """
 
     model_type = "jukebox_vqvae"
@@ -442,9 +446,11 @@ def __init__(
         res_downs_t=[3, 2, 2],
         res_strides_t=[2, 2, 2],
         sample_length=1058304,
+        init_scale=0.2,
+        zero_out=False,
         **kwargs
     ):
-
+        super().__init__()
         self.hop_fraction = hop_fraction
         self.conv_input_shape = conv_input_shape
         self.sample_length = sample_length
@@ -465,6 +471,8 @@ def __init__(
         self.commit = commit
         self.conv_res_scale = conv_res_scale
         self.act_fn = act_fn
+        self.init_scale = init_scale
+        self.zero_out = zero_out
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 18fb7276f90b2..fe7509f5cf66f 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -605,6 +605,23 @@ class JukeboxVQVAE(PreTrainedModel):
     base_model_prefix = "vqvae"
     _keys_to_ignore_on_load_unexpected = [r"priors"]
 
+    def _init_weights(self, module):
+        if isinstance(module, nn.Embedding):  # embed_tokens
+            module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+        elif isinstance(module, JukeboxConv1D):
+            if self.config.zero_out:
+                module.weight.data.zero_()
+            else:
+                module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+            module.conv1d_2.weight.data.zero_()
+            module.conv1d_2.bias.data.zero_()
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
     def __init__(self, config: JukeboxVQVAEConfig):
         super().__init__(config)
         downs_t = config.res_downs_t
@@ -1789,12 +1806,10 @@ def _init_weights(self, module):
             module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxRangeEmbedding):
             module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
-            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-        elif isinstance(module, nn.Parameter) and "pos_emb" in module.__class__.__name__:
-            module.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, nn.Parameter) and "start_token" in module.__class__.__name__:
-            module.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "lm_head"):
+            module.lm_head.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"):
+            module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale)
         elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
             module.conv1d_2.weigth.data.zero_()
             module.conv1d_2.bias.data.zero_()
@@ -2259,33 +2274,8 @@ class JukeboxPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = False
 
     def _init_weights(self, module):
-        init_scale = self.config.init_scale
-
-        if isinstance(module, nn.Embedding):  # embed_tokens
-            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-        elif isinstance(module, JukeboxConv1D):
-            if self.config.zero_out:
-                module.weight.data.zero_()
-            else:
-                module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-        elif isinstance(module, JukeboxPositionalEmbedding):
-            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, JukeboxRangeEmbedding):
-            module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, nn.Linear) and "encoder.lm_head" in module.__class__.__name__:
-            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
-        elif isinstance(module, nn.Parameter) and "pos_emb" in module.__class__.__name__:
-            module.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, nn.Parameter) and "start_token" in module.__class__.__name__:
-            module.data.normal_(mean=0.0, std=0.01 * init_scale)
-        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
-            module.conv1d_2.weigth.data.zero_()
-            module.conv1d_2.bias.data.zero_()
-        if isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
+        if isinstance(module, JukeboxPrior) or isinstance(module, JukeboxVQVAE):
+            module.apply(module._init_weights)
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)

From 9adc384ae0061dc42dbf9b09b9151ff6b29eb235 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 09:15:58 +0000
Subject: [PATCH 188/196] fix last test

---
 .../models/jukebox/configuration_jukebox.py   |  2 +-
 .../models/jukebox/modeling_jukebox.py        | 22 +++++++++----------
 tests/models/jukebox/test_modeling_jukebox.py |  6 ++---
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index ec60edbb6ca52..ae11adc377d9c 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -503,7 +503,7 @@ class JukeboxConfig(PretrainedConfig):
 
 
     The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
-    (5,3), and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and 2**8 = 256
+    (5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256
     to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
 
     Args:
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index fe7509f5cf66f..07d152936d068 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Pytorch Jukebox model."""
+"""PyTorch Jukebox model."""
 
 import math
 import os
@@ -179,10 +179,10 @@ def save_temp_audio(fname, lvl, metas, aud):
     for i in list(range(aud.shape[0])):
         if metas is not None:
             artists, genres, lyrics = list(metas)[i].values()
-            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}.wav"
+            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}"
             np.save(path, aud[i])
         else:
-            np.save(f"{fname}/lvl_{lvl}-sample-{i}.wav", aud[i])
+            np.save(f"{fname}/lvl_{lvl}-sample-{i}", aud[i])
 
 
 def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
@@ -582,8 +582,8 @@ def forward(self, input_audio):
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
 
-    This model is also a Pytorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular Pytorch Module and refer to the Pytorch documentation for all matter related to general usage
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
 
     Parameters:
@@ -1315,7 +1315,7 @@ def __init__(
             metadata_conditioning (`bool`, *optional*, defaults to `False`):
                 whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
             is_encoder (`bool`, *optional*, defaults to `False`):
-                _description_
+                Whether the model is an encoder only model.
         """
 
         super().__init__()
@@ -1378,7 +1378,7 @@ def forward(
             )
 
         target = tokens  # Target
-        hidden_states = self.embed_tokens(tokens)  # music_tokens embedding
+        hidden_states = self.embed_tokens(tokens)
         # Shift by 1, and fill in start token
         hidden_states = torch.cat((hidden_states[:, -1:], hidden_states[:, :-1]), dim=1)
         if self.metadata_conditioning:
@@ -1992,7 +1992,6 @@ def prior_postprocess(self, tokens):
         shared, prior_embed_dim_shift shifts the music token ids by nb_vocab. Only returns the music tokens.
         """
         batch_size = tokens.shape[0]
-        # dim (nb_lyric_tokens, codebook dim = latent_dim of the model)
         dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
         tokens = list(torch.split(tokens, dims, dim=1))
 
@@ -2457,9 +2456,10 @@ def _sample(
         the generated raw audio at each step.
 
         Args:
-           music_tokens (`List[torch.LongTensor] of length `self.levels` ) :
-                A sequence of music tokens which will be used as context to continue the sampling process. Should have
-                `self.levels` tensors, each corresponding to the generation at a certain level.
+           music_tokens (`List[torch.LongTensor]`):
+                A sequence of music tokens of length `self.levels` which will be used as context to continue the
+                sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
+                level.
             labels (`List[torch.LongTensor]`):
                 List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
                 lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
diff --git a/tests/models/jukebox/test_modeling_jukebox.py b/tests/models/jukebox/test_modeling_jukebox.py
index 657acbfc1b653..9232119432f5a 100644
--- a/tests/models/jukebox/test_modeling_jukebox.py
+++ b/tests/models/jukebox/test_modeling_jukebox.py
@@ -28,7 +28,7 @@
 @require_torch
 class Jukebox1bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "jukebox-1b-lyrics"
+    model_id = "openai/jukebox-1b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -218,7 +218,7 @@ def test_vqvae(self):
 @require_torch
 class Jukebox5bModelTester(unittest.TestCase):
     all_model_classes = (JukeboxModel,) if is_torch_available() else ()
-    model_id = "jukebox-5b-lyrics"
+    model_id = "openai/jukebox-5b-lyrics"
     metas = dict(
         artist="Zac Brown Band",
         genres="Country",
@@ -245,7 +245,7 @@ class Jukebox5bModelTester(unittest.TestCase):
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
         653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653, 653,
-        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1237
+        1489, 1489, 1489, 1489, 1150, 1853, 1509, 1150, 1357, 1509, 6, 1272
     ]
 
     EXPECTED_OUTPUT_1 = [

From daa4cd492e98b8b6143299b3127dfba5adce9da9 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 09:48:49 +0000
Subject: [PATCH 189/196] remove super init and clean

---
 src/transformers/models/jukebox/configuration_jukebox.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index ae11adc377d9c..21dccff917bce 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -240,7 +240,6 @@ class JukeboxPriorConfig(PretrainedConfig):
 
     model_type = "jukebox_prior"
     attribute_map = {
-        "hidden_size": "vqvae_codebook_dimension",
         "max_position_embeddings": "n_positions",
         "num_attention_heads": "n_head",
     }
@@ -291,7 +290,6 @@ def __init__(
         zero_out=False,
         **kwargs
     ):
-        super().__init__()
         self.act_fn = act_fn
         self.alignment_head = alignment_head
         self.alignment_layer = alignment_layer
@@ -450,7 +448,6 @@ def __init__(
         zero_out=False,
         **kwargs
     ):
-        super().__init__()
         self.hop_fraction = hop_fraction
         self.conv_input_shape = conv_input_shape
         self.sample_length = sample_length

From e6363603b28599f43a68bec93ec72f69ac7a4b68 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 10:19:34 +0000
Subject: [PATCH 190/196] fix doctest

---
 .../models/jukebox/modeling_jukebox.py        | 20 ++++++++++---------
 .../models/jukebox/tokenization_jukebox.py    |  4 ++--
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 07d152936d068..e3b292fc8d9e6 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -2312,8 +2312,8 @@ def __init__(self, config):
 
     def set_shared_params(self, model_config):
         """
-        Initialises the parameters that are shared. This has to be done here because the list of PiroConfig is nest,
-        and is thus unreachable in the `from_dict` function
+        Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig`
+        is nest, and is thus unreachable in the `from_dict` function
         """
         for config in model_config.prior_configs:
             config.sampling_rate = model_config.sampling_rate
@@ -2456,7 +2456,7 @@ def _sample(
         the generated raw audio at each step.
 
         Args:
-           music_tokens (`List[torch.LongTensor]`):
+            music_tokens (`List[torch.LongTensor]`):
                 A sequence of music tokens of length `self.levels` which will be used as context to continue the
                 sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
                 level.
@@ -2509,13 +2509,14 @@ def _sample(
         >>> labels = tokenizer(**metas)["input_ids"]
         >>> set_seed(0)
         >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
-        >>> zs = model._sample(zs, labels, [2], sample_length=40 * model.priors[-1].raw_to_tokens, save_results=False)
-        >>> zs[-1]
+        >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
+        >>> zs[0]
         tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
               353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
              1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
              1804,  541, 1804, 1434]])
-        ```"""
+        ```
+        """
 
         top_prior = self.priors[0]
         if sample_length is not None:
@@ -2604,9 +2605,10 @@ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> List[torch
 
         >>> with torch.no_grad():
         ...     model.decode(music_tokens)[:, :10].squeeze(-1)
-        tensor([[-0.0003, -0.0012,  0.0009,  0.0012,  0.0018,  0.0003, -0.0015, -0.0020,
-                 -0.0013,  0.0010]])
-        ```"""
+        tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
+            -0.0818, -0.0697]])
+        ```
+        """
 
         sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
         music_tokens = [
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py
index 13b17ddb969f5..01bada0e0806b 100644
--- a/src/transformers/models/jukebox/tokenization_jukebox.py
+++ b/src/transformers/models/jukebox/tokenization_jukebox.py
@@ -73,8 +73,8 @@ class JukeboxTokenizer(PreTrainedTokenizer):
     >>> from transformers import JukeboxTokenizer
     >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
     >>> tokenizer("Alan Jackson", "Country Rock", "old town road")['input_ids']
-    [tensor([[  0,   0,   0, 145,   0]]), tensor([[  0,   0,   0, 145,   0]]), tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
-               40,   76,   44,   41,   27,   30]])]
+    [tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
+               40,   76,   44,   41,   27,   30]]), tensor([[  0,   0,   0, 145,   0]]), tensor([[  0,   0,   0, 145,   0]])]
 
     ```
 

From 2a076fa5c9af73b3c4f356a4615dbfd985a7b158 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 12:18:39 +0000
Subject: [PATCH 191/196] nits

---
 .../models/jukebox/modeling_jukebox.py        | 107 +++++++++---------
 1 file changed, 54 insertions(+), 53 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index e3b292fc8d9e6..ef2da32fad411 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -41,11 +41,16 @@
 
 
 def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
-    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+    """
+    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+
     Args:
-        logits: logits distribution shape (vocabulary size)
-        top_k >0: keep only top key tokens with highest probability (top-k filtering).
-        top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+        logits (`torch.Tensor`):
+            logits distribution shape (vocabulary size)
+        top_k (`int`, *optional*, defaults to 0):
+            When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
+        top_p (`int`, *optional*, defaults to 0):
+            When  `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
     """
     logits = logits.clone()
     top_k = min(top_k, logits.size(-1))  # Safety check
@@ -195,11 +200,12 @@ def get_mask(mask, query_length, key_value_length, blocks, spread, device, sampl
         mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
     elif mask == "summary":
         # Masked summary
+        mask = torch.ones(query_length, query_length, device=device).tril()
+        mask = torch.ones(query_length, query_length, device=device).tril()
+        mask = mask.view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :]
         mask = (
             torch.nn.functional.pad(
-                torch.ones(query_length, query_length, device=device)
-                .tril()
-                .view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :],
+                mask,
                 (0, 0, 1, 0),
                 value=1,
             )
@@ -397,7 +403,6 @@ def _tile(self, hidden_states):
     def init_codebook(self, hidden_states):
         nb_discrete_codes = self.nb_discrete_codes
         self.init = True
-        # init k_w using random vectors from hidden_states codebook_w (index w?)
         codes = self._tile(hidden_states)
         self.codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
         self.codebook_sum = self.codebook
@@ -407,19 +412,18 @@ def update_codebook(self, hidden_states, latent_states):
         mu, codebook_width, nb_discrete_codes = self.mu, self.codebook_width, self.nb_discrete_codes
         with torch.no_grad():
             # Calculate new centres
-            latent_states_onehot = torch.zeros(
-                nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device
-            )  # nb_discrete_codes, batch_size * L
+            # nb_discrete_codes, batch_size * seq_length
+            latent_states_onehot = torch.zeros(nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device)
             latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
 
-            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)  # nb_discrete_codes, w
+            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)
             _codebook_elem = latent_states_onehot.sum(dim=-1)  # nb_discrete_codes
             codes = self._tile(hidden_states)
             _random_codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
 
             # Update centres
             old_codebook = self.codebook
-            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum  # w, nb_discrete_codes
+            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum
             self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # nb_discrete_codes
             usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
             self.codebook = (
@@ -430,9 +434,7 @@ def update_codebook(self, hidden_states, latent_states):
                 )
                 + (1 - usage) * _random_codebook
             )
-            _codebook_prob = _codebook_elem / torch.sum(
-                _codebook_elem
-            )  # latent_states_onehot.mean(dim=-1)  # prob of each bin
+            _codebook_prob = _codebook_elem / torch.sum(_codebook_elem)  # prob of each bin
             entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8))  # entropy ie how diverse
             used_curr = (_codebook_elem >= self.threshold).sum()
             usage = torch.sum(usage)
@@ -1242,10 +1244,12 @@ def __init__(self, config, n_ctx):
 
     def set_record_attn(self, record_attn):
         """
-        Arguments:
-            record_attn (bool or set): Makes forward prop dump self-attention
-                softmaxes to self.saved_attn_weights. Either a set of layer indices indicating which layers to store,
-                or a boolean value indicating whether to dump all.
+        Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.
+
+        Args:
+            record_attn (`Union[bool,set]`):
+                Either a set of layer indices indicating which layers to store,
+                or a boolean value indicating Whether to dump all.
         """
 
         def _should_record_attn(layer_idx):
@@ -1298,7 +1302,8 @@ def __init__(
         is_encoder=False,
     ):
         """
-        Autoregressive model.
+        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern
+        should be properly set fro each configuration.
 
         Args:
             config (`JukeboxPriorConfig`):
@@ -1306,14 +1311,14 @@ def __init__(
                 not load the weights associated with the model, only the configuration. Check out the
                 [`~PreTrainedModel.from_pretrained`] method to load the model weights.
             n_ctx (`int`, *optional*):
-                number of tokens or lyrics tokens provided in a single pass.
+                Number of tokens or lyrics tokens provided in a single pass.
             embed_dim (`int`, *optional*):
-                either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
+                Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codeboook dimension,
                 if the model combines lyrics and music tokens, or simply n_vocab if the model is a seperate encoder
             audio_conditioning (`bool`, *optional*, defaults to `False`):
-                whether or not the prior supports conditionning on audio.
+                Whether or not the prior supports conditionning on audio.
             metadata_conditioning (`bool`, *optional*, defaults to `False`):
-                whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
+                Whether or not the prior supports conditionning on artitst, genres, lyrics and timing.
             is_encoder (`bool`, *optional*, defaults to `False`):
                 Whether the model is an encoder only model.
         """
@@ -1362,8 +1367,8 @@ def forward(
     ):
         """
         Args:
-        tokens : composed of both music tokens and lyrics tokens or just music tokens
-        depending on the `merged_decoder` flag.
+            tokens (`torch.tensor`):
+                Can represent music tokens, lyrics tokens or both, depending on the configuration.
         """
         # Preprocess.
         batch_size = tokens.shape[0]
@@ -1641,7 +1646,7 @@ def __init__(self, config, level):
 
     def forward(self, music_tokens, raw_audio_conditionning=None):
         """
-        Args :
+        Args:
             music_tokens (`torch.LongTensor`):
                 Music tokens form the uper level in range(nb_discrete_codes)
             raw_audio_conditionning (`torch.LongTensor`, *optional*):
@@ -1663,14 +1668,17 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
 
 
 class JukeboxRangeEmbedding(nn.Module):
-    # Interpolating
-    # Interpolate so that [pos_start, pos_end] <-> position tensor of length n_ctx
-    #
-    # Binning
-    # For each pos in position tensor, find its bin
-    # [start,end) mapped to [0,1,...,bins-1]
-    # [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
-    # NOTE: Open ended interval on right, so start <= pos < end, not <= end
+    """
+    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of
+    time positional embedding of length `n_ctx`.
+
+    Binning process :
+    For each pos in position tensor, find its bin
+    [start,end) mapped to [0,1,...,bins-1]
+    [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
+    NOTE: Open ended interval on right, so start <= pos < end, not <= end
+    """
+
     def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
         super().__init__()
         self.n_time = n_time
@@ -1722,7 +1730,7 @@ def __init__(self, config, include_time_signal):
         self.max_nb_genres = config.max_nb_genres
         self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)
         self.artist_emb = nn.Embedding(nb_artists, embed_dim)
-        self.include_time_signal = include_time_signal  # add to config
+        self.include_time_signal = include_time_signal
         if self.include_time_signal:
             total_length_range = (config.min_duration * sampling_rate, config.max_duration * sampling_rate)
             absolute_pos_range = (0.0, config.max_duration * sampling_rate)
@@ -1778,7 +1786,7 @@ class JukeboxPrior(PreTrainedModel):
             load the weights associated with the model, only the configuration. Check out the
             [`~PreTrainedModel.from_pretrained`] method to load the model weights.
         level (`int`, *optional*):
-            Current level of the Prior. Should be in range `0,nb_priors`.
+            Current level of the Prior. Should be in range `[0,nb_priors]`.
         nb_priors (`int`, *optional*, defaults to 3):
             Total number of priors.
         vqvae_encoder (`Callable`, *optional*):
@@ -1795,7 +1803,7 @@ class JukeboxPrior(PreTrainedModel):
     def _init_weights(self, module):
         init_scale = self.config.init_scale
 
-        if isinstance(module, nn.Embedding):  # embed_tokens
+        if isinstance(module, nn.Embedding):
             module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
         elif isinstance(module, JukeboxConv1D):
             if self.config.zero_out:
@@ -1830,7 +1838,7 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
         self.level = level if level is not None else config.level
 
         self.base_model_prefix = f"priors.{self.level}"
-        self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level]
+        self._keys_to_ignore_on_load_unexpected = ["vqvae", r"priors.[^%d]." % self.level]
 
         self.n_ctx = config.n_ctx
 
@@ -1847,7 +1855,6 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
         # metadata conditioning : contioning on timing, genres, and artist
         self.metadata_conditioning = config.metadata_conditioning
         if self.metadata_conditioning:
-            # Assuming STFT=TF order and raw=T1 order, so Time is first dim
             self.metadata_embedding = LabelConditioner(config, include_time_signal=not self.audio_conditioning)
 
         # define encoder-decoder or encoder and decoder
@@ -1870,7 +1877,6 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
 
         else:
             # Separate encoder-decoder transformer
-            #  we have to modify the config to use the encoder variables for the lyric encoder
             encoder_config = config.encoder_config
 
             if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
@@ -1919,7 +1925,7 @@ def get_metadata(self, labels, start, total_length, offset, get_indices=False):
 
         # Set offset
         metadata[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
-        # here since metadata has the full token_list, ze just need to selected the ones that are relevant
+        # here since metadata has the full token_list, we just need to selected the ones that are relevant
 
         # Set lyric tokens
         metadata, indices = self.set_metadata_lyric_tokens(metadata)
@@ -1972,7 +1978,7 @@ def get_music_tokens_conds(self, music_tokens, start, end):
     def prior_preprocess(self, tokens, conds):
         """
         Shifts the input tokens to account for the dictionnary merge. The embed_dim_shift give by how much the music
-        tokens should be shifted by. It is equal to lyric_vocab_size.
+        tokens should be shifted by. It is equal to `lyric_vocab_size`.
         """
         batch_size = tokens[0].shape[0]
         for i in range(len(tokens)):
@@ -1989,7 +1995,7 @@ def prior_preprocess(self, tokens, conds):
     def prior_postprocess(self, tokens):
         """
         Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
-        shared, prior_embed_dim_shift shifts the music token ids by nb_vocab. Only returns the music tokens.
+        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music tokens.
         """
         batch_size = tokens.shape[0]
         dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
@@ -2013,7 +2019,6 @@ def embed_tokens(self, music_tokens_conds):
             audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
         return audio_conditioning
 
-    # Used in the forward pass
     def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
         """
         Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
@@ -2188,11 +2193,6 @@ def forward_tokens(
         """
         Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
         vqvae's encoding layers.
-
-        Args:
-            get_attn_weights (bool or set): Makes forward prop dump
-                self-attention softmaxes to self.prior.transformer.saved_attn_weights. Either a set of layer indices
-                indicating which layers to store, or a boolean value indicating whether to dump all.
         """
         if get_attn_weights:
             self.prior.transformer.set_record_attn(get_attn_weights)
@@ -2557,8 +2557,9 @@ def _sample(
                 self.vqvae.to(music_tokens[level].device)
                 # Decode sample
                 with torch.no_grad():
+                    start_level = len(self.priors) - level - 1  #  vqvae levels are reversed
                     raw_audio = self.vqvae.decode(
-                        music_tokens[: level + 1], start_level=level, bs_chunks=music_tokens[level].shape[0]
+                        music_tokens[: level + 1], start_level=start_level, bs_chunks=music_tokens[level].shape[0]
                     )
                 logdir = f"jukebox/level_{level}"
                 if not os.path.exists(logdir):
@@ -2566,7 +2567,7 @@ def _sample(
                 save_temp_audio(logdir, level, metas=metas, aud=raw_audio.float())
                 if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
                     with torch.no_grad():
-                        alignments = get_alignment(music_tokens, labels[-1], self.priors[0], self.config)
+                        alignments = get_alignment(music_tokens, labels[0], self.priors[0], self.config)
                     torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
 
         return music_tokens

From 415206ceff4d5bf2c86e8995dfb4a22a77057f18 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 13:19:00 +0000
Subject: [PATCH 192/196] last fixup

---
 .../models/jukebox/modeling_jukebox.py        | 29 +++++++++----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index ef2da32fad411..59586ca620a22 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -50,7 +50,7 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
         top_k (`int`, *optional*, defaults to 0):
             When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
         top_p (`int`, *optional*, defaults to 0):
-            When  `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
+            When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
     """
     logits = logits.clone()
     top_k = min(top_k, logits.size(-1))  # Safety check
@@ -1248,8 +1248,8 @@ def set_record_attn(self, record_attn):
 
         Args:
             record_attn (`Union[bool,set]`):
-                Either a set of layer indices indicating which layers to store,
-                or a boolean value indicating Whether to dump all.
+                Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether
+                to dump all.
         """
 
         def _should_record_attn(layer_idx):
@@ -1302,8 +1302,8 @@ def __init__(
         is_encoder=False,
     ):
         """
-        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern
-        should be properly set fro each configuration.
+        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
+        set fro each configuration.
 
         Args:
             config (`JukeboxPriorConfig`):
@@ -1669,14 +1669,12 @@ def forward(self, music_tokens, raw_audio_conditionning=None):
 
 class JukeboxRangeEmbedding(nn.Module):
     """
-    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of
-    time positional embedding of length `n_ctx`.
-
-    Binning process :
-    For each pos in position tensor, find its bin
-    [start,end) mapped to [0,1,...,bins-1]
-    [start,end) -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1]
-    NOTE: Open ended interval on right, so start <= pos < end, not <= end
+    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional
+    embedding of length `n_ctx`.
+
+    Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end)
+    -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <=
+    end
     """
 
     def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
@@ -1995,7 +1993,8 @@ def prior_preprocess(self, tokens, conds):
     def prior_postprocess(self, tokens):
         """
         Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
-        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music tokens.
+        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
+        tokens.
         """
         batch_size = tokens.shape[0]
         dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
@@ -2557,7 +2556,7 @@ def _sample(
                 self.vqvae.to(music_tokens[level].device)
                 # Decode sample
                 with torch.no_grad():
-                    start_level = len(self.priors) - level - 1  #  vqvae levels are reversed
+                    start_level = len(self.priors) - level - 1  # vqvae levels are reversed
                     raw_audio = self.vqvae.decode(
                         music_tokens[: level + 1], start_level=start_level, bs_chunks=music_tokens[level].shape[0]
                     )

From d2a5261c63610e0fbc6c8b3344b4b9fd2ca244a0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 15:35:40 +0000
Subject: [PATCH 193/196] code reviews

---
 docs/source/en/model_doc/jukebox.mdx            | 16 ++++++++--------
 .../models/jukebox/modeling_jukebox.py          | 17 +++++++----------
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 2351e8e84b70f..6c29e41568c3d 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -41,6 +41,14 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 [[autodoc]] JukeboxConfig
 
+## JukeboxPriorConfig
+
+[[autodoc]] JukeboxPriorConfig
+
+## JukeboxVQVAEConfig
+
+[[autodoc]] JukeboxVQVAEConfig
+
 ## JukeboxModel
 
 [[autodoc]] JukeboxModel
@@ -55,9 +63,6 @@ The original code can be found [here](https://github.com/openai/jukebox).
 [[autodoc]] JukeboxTokenizer
     - save_vocabulary
 
-## JukeboxPriorConfig
-
-[[autodoc]] JukeboxPriorConfig
 
 ## JukeboxPrior
 
@@ -66,11 +71,6 @@ The original code can be found [here](https://github.com/openai/jukebox).
     - forward
 
 
-## JukeboxVQVAEConfig
-
-[[autodoc]] JukeboxVQVAEConfig
-
-
 ## JukeboxVQVAE
 
 [[autodoc]] JukeboxVQVAE
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py
index 59586ca620a22..956260a25c685 100755
--- a/src/transformers/models/jukebox/modeling_jukebox.py
+++ b/src/transformers/models/jukebox/modeling_jukebox.py
@@ -426,14 +426,11 @@ def update_codebook(self, hidden_states, latent_states):
             self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum
             self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # nb_discrete_codes
             usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
-            self.codebook = (
-                usage
-                * (
-                    self.codebook_sum.view(nb_discrete_codes, codebook_width)
-                    / self.codebook_elem.view(nb_discrete_codes, 1)
-                )
-                + (1 - usage) * _random_codebook
+
+            norm_code = self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view(
+                nb_discrete_codes, 1
             )
+            self.codebook = usage * (norm_code) + (1 - usage) * _random_codebook
             _codebook_prob = _codebook_elem / torch.sum(_codebook_elem)  # prob of each bin
             entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8))  # entropy ie how diverse
             used_curr = (_codebook_elem >= self.threshold).sum()
@@ -1715,7 +1712,7 @@ def forward(self, pos_start, pos_end=None):
         return self.emb(bins_)
 
 
-class LabelConditioner(nn.Module):
+class JukeboxLabelConditioner(nn.Module):
     def __init__(self, config, include_time_signal):
         super().__init__()
 
@@ -1836,7 +1833,7 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
         self.level = level if level is not None else config.level
 
         self.base_model_prefix = f"priors.{self.level}"
-        self._keys_to_ignore_on_load_unexpected = ["vqvae", r"priors.[^%d]." % self.level]
+        self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level]
 
         self.n_ctx = config.n_ctx
 
@@ -1853,7 +1850,7 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en
         # metadata conditioning : contioning on timing, genres, and artist
         self.metadata_conditioning = config.metadata_conditioning
         if self.metadata_conditioning:
-            self.metadata_embedding = LabelConditioner(config, include_time_signal=not self.audio_conditioning)
+            self.metadata_embedding = JukeboxLabelConditioner(config, include_time_signal=not self.audio_conditioning)
 
         # define encoder-decoder or encoder and decoder
         self.is_encoder_decoder = config.is_encoder_decoder

From e492cab508bb4dd4adf0e070df197efc7977cc8c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 18:18:11 +0000
Subject: [PATCH 194/196] remove lambda in ATTENTION_PATTERNS

---
 .../models/jukebox/configuration_jukebox.py   | 33 +++++++++++++------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 21dccff917bce..888ab20ffa4a8 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -114,17 +114,30 @@
 _FullDenseAttention = ["dense_attention"]
 _PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
 
+
+def FullDenseAttention(layer):
+    return _FullDenseAttention[0]
+
+
+def RawColumnPreviousRowAttention(layer):
+    return _RawColumnPreviousRowAttention[layer % 3]
+
+
+def large_separated_enc_dec_w_lyrics(layer):
+    return _LARGE_ATTENTION[layer % 79]
+
+
+def enc_dec_with_lyrics(layer):
+    if layer % 16 == 15:
+        return _PrimePrimeDenseAttention[layer % 3]
+    return _RawColumnPreviousRowAttention[layer % 3]
+
+
 ATTENTION_PATTERNS = {
-    "FullDenseAttention": lambda layer: _FullDenseAttention[0],
-    "RawColumnPreviousRowAttention": lambda layer: _RawColumnPreviousRowAttention[
-        layer % 3
-    ],  # Alternate row, column and previous row attn
-    "large_separated_enc_dec_w_lyrics": lambda layer: _LARGE_ATTENTION[
-        layer % 79
-    ],  # Used by large separated_enc_dec model with lyrics
-    "enc_dec_with_lyrics": lambda layer: _PrimePrimeDenseAttention[layer % 3]
-    if layer % 16 == 15
-    else _RawColumnPreviousRowAttention[layer % 3],  # Used by encoder_decoder model with lyrics
+    "FullDenseAttention": FullDenseAttention,
+    "RawColumnPreviousRowAttention": RawColumnPreviousRowAttention,  # Alternate row, column and previous row attn
+    "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics,  # Used by large separated_enc_dec model with lyrics
+    "enc_dec_with_lyrics": enc_dec_with_lyrics,  # Used by encoder_decoder model with lyrics
 }
 
 

From 9cbd46276740558ab4aef9ae79bf5c382621648e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 19:12:21 +0000
Subject: [PATCH 195/196] update attention pattern!!

---
 src/transformers/models/jukebox/configuration_jukebox.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py
index 888ab20ffa4a8..6ce345a8578e2 100644
--- a/src/transformers/models/jukebox/configuration_jukebox.py
+++ b/src/transformers/models/jukebox/configuration_jukebox.py
@@ -115,11 +115,11 @@
 _PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
 
 
-def FullDenseAttention(layer):
+def full_dense_attention(layer):
     return _FullDenseAttention[0]
 
 
-def RawColumnPreviousRowAttention(layer):
+def raw_column_previous_row_attention(layer):
     return _RawColumnPreviousRowAttention[layer % 3]
 
 
@@ -134,8 +134,8 @@ def enc_dec_with_lyrics(layer):
 
 
 ATTENTION_PATTERNS = {
-    "FullDenseAttention": FullDenseAttention,
-    "RawColumnPreviousRowAttention": RawColumnPreviousRowAttention,  # Alternate row, column and previous row attn
+    "full_dense_attention": full_dense_attention,
+    "raw_column_previous_row_attention": raw_column_previous_row_attention,  # Alternate row, column and previous row attn
     "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics,  # Used by large separated_enc_dec model with lyrics
     "enc_dec_with_lyrics": enc_dec_with_lyrics,  # Used by encoder_decoder model with lyrics
 }

From e6ef5352f5795d03299dba0e20d77f81417a3bf8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 10 Nov 2022 19:12:36 +0000
Subject: [PATCH 196/196] tokenizer before model

---
 docs/source/en/model_doc/jukebox.mdx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/model_doc/jukebox.mdx b/docs/source/en/model_doc/jukebox.mdx
index 6c29e41568c3d..860fb8fc3f67b 100644
--- a/docs/source/en/model_doc/jukebox.mdx
+++ b/docs/source/en/model_doc/jukebox.mdx
@@ -49,6 +49,11 @@ The original code can be found [here](https://github.com/openai/jukebox).
 
 [[autodoc]] JukeboxVQVAEConfig
 
+## JukeboxTokenizer
+
+[[autodoc]] JukeboxTokenizer
+    - save_vocabulary
+
 ## JukeboxModel
 
 [[autodoc]] JukeboxModel
@@ -58,11 +63,6 @@ The original code can be found [here](https://github.com/openai/jukebox).
     - upsample
     - _sample
 
-## JukeboxTokenizer
-
-[[autodoc]] JukeboxTokenizer
-    - save_vocabulary
-
 
 ## JukeboxPrior