From cd94d036d64952b082fb6552b2dd97d140eea649 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 19 Sep 2022 06:36:09 +0000
Subject: [PATCH 001/156] simplify loop

---
 src/transformers/modeling_tf_utils.py | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bbcbf125d8d4f..aa83e560b9fb8 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -2151,17 +2151,24 @@ def save_pretrained(
             )
             for shard_file, shard in shards.items():
                 with h5py.File(os.path.join(save_directory, shard_file), mode="w") as shard_file:
-                    save_attributes_to_hdf5_group(
-                        shard_file,
-                        "layer_names",
-                        ["/".join(layer.name.split("/")[1:]).encode("utf8") for layer in shard],
-                    )
-
+                    layers = []
                     for layer in sorted(shard, key=lambda x: x.name):
+                        if "embed_tokens" in layer.name: 
+                            layer_name = layer.name
+                        if "model." in layer.name : 
+                            layer_name = layer.name
+                        else:
+                            layer_name = "/".join(layer.name.split("/")[1:])
                         param_dset = shard_file.create_dataset(
-                            "/".join(layer.name.split("/")[1:]), layer.numpy().shape, dtype=layer.numpy().dtype
+                                layer_name, layer.numpy().shape, dtype=layer.numpy().dtype
                         )
                         param_dset[:] = layer.numpy()
+                        layers.append(layer_name.encode("utf8"))
+                    save_attributes_to_hdf5_group(
+                        shard_file,
+                        "layer_names",
+                        layers,
+                    )
 
         if push_to_hub:
             self._upload_modified_files(

From 46b0ebe8ab4b425b7e9b58502015a724c9fc7a9a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 08:34:08 +0000
Subject: [PATCH 002/156] add featur extractor

---
 .../whisper/feature_extraction_whisper.py     | 229 ++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 src/transformers/models/whisper/feature_extraction_whisper.py

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
new file mode 100644
index 0000000000000..ffea108a6b50c
--- /dev/null
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -0,0 +1,229 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for Whisper
+"""
+
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+import torchaudio.compliance.kaldi as ta_kaldi
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class WhisperFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a Whisper feature extractor.
+
+    This feature extractor inherits from [`WhisperFeatureExtractor`] which contains most of the main methods. Users
+    should refer to this superclass for more information regarding those methods.
+
+    This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
+    mean and variance normalization to the extracted features.
+
+    Args:
+        feature_size (`int`, defaults to 80):
+            The feature dimension of the extracted features.
+        sampling_rate (`int`, defaults to 16000):
+            The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
+        num_mel_bins (`int`, defaults to 80):
+            Number of Mel-frequency bins.
+        padding_value (`float`, defaults to 0.0):
+            The value that is used to fill the padding vectors.
+        do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
+        normalize_means (`bool`, *optional*, defaults to `True`):
+            Whether or not to zero-mean normalize the extracted features.
+        normalize_vars (`bool`, *optional*, defaults to `True`):
+            Whether or not to unit-variance normalize the extracted features.
+    """
+
+    model_input_names = ["input_features", "attention_mask"]
+
+    def __init__(
+        self,
+        mel_filter_file,
+        feature_size=80,
+        sampling_rate=16000,
+        num_mel_bins=80,
+        hop_length=160,
+        chunk_length=30,
+        n_fft=400,
+        padding_value=0.0,
+        **kwargs
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.num_mel_bins = num_mel_bins
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.chunk_length = chunk_length
+        self.return_attention_mask = True
+
+        with np.load(mel_filter_file) as f:
+            self.mel_filters = torch.from_numpy(f[f"mel_{self.num_mel_bins}"])
+
+    def _extract_fbank_features(
+        self,
+        waveform: np.ndarray,
+    ) -> np.ndarray:
+        """
+        Compute the log-Mel spectrogram of the provided audio
+        """
+        waveform = torch.from_numpy(waveform)
+        window = torch.hann_window(self.n_fft).to(waveform.device)
+        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
+        magnitudes = stft[:, :-1].abs() ** 2
+
+        filters = self.mel_filters
+        mel_spec = filters @ magnitudes
+
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        return log_spec
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        max_length: Optional[int] = None,
+        truncation: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        **kwargs
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s). sequences.
+
+        Args:
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+                <Tip>
+
+                For WhisperTransoformer models, `attention_mask` should alwys be passed for batched inference, to avoid
+                subtle bugs.
+
+                </Tip>
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+            padding_value (`float`, defaults to 0.0):
+                The value that is used to fill the padding values / vectors.
+        """
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched = bool(
+            isinstance(raw_speech, (list, tuple))
+            and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_speech = [raw_speech]
+
+        # extract fbank features
+        features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
+
+        # convert into correct format for padding
+        encoded_inputs = BatchFeature({"input_features": features})
+
+        padded_inputs = self.pad(
+            encoded_inputs,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+            **kwargs,
+        )
+
+        # make sure list is in array format
+        input_features = padded_inputs.get("input_features")
+        if isinstance(input_features[0], list):
+            padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
+
+        attention_mask = padded_inputs.get("attention_mask")
+        if attention_mask is not None:
+            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
+
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+    def save_pretrained(self, pretrained_model_name_or_path, **kwargs):
+        super().save_pretrained(pretrained_model_name_or_path)
+        np.savez_compressed("mel_filters.npz", mel_80=self.mel_filters)

From af9d14f3809ca2f12986abb723ead77288065afb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 09:25:24 +0000
Subject: [PATCH 003/156] add model

---
 .../whisper/feature_extraction_whisper.py     |    1 -
 .../models/whisper/modeling_whisper.py        | 1461 +++++++++++++++++
 2 files changed, 1461 insertions(+), 1 deletion(-)
 create mode 100644 src/transformers/models/whisper/modeling_whisper.py

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index ffea108a6b50c..24dfcb34e437f 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -20,7 +20,6 @@
 
 import numpy as np
 import torch
-import torchaudio.compliance.kaldi as ta_kaldi
 
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
new file mode 100644
index 0000000000000..8f1fc405dcabe
--- /dev/null
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -0,0 +1,1461 @@
+# coding=utf-8
+# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Whisper model."""
+
+
+import math
+import random
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_whisper import WhisperConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "WhisperConfig"
+
+
+WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "openai/whisper-base",
+    # See all Whisper models at https://huggingface.co/models?filter=whisper
+]
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
+class Conv1dSubsampler(nn.Module):
+    """
+    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
+    via gated linear units (https://arxiv.org/abs/1911.08460)
+    """
+
+    def __init__(self, config):
+        super(Conv1dSubsampler, self).__init__()
+        self.config = config
+        self.num_layers = config.num_conv_layers
+        self.in_channels = config.input_feat_per_channel * config.input_channels
+        self.mid_channels = config.conv_channels
+        self.out_channels = config.d_model
+        self.kernel_sizes = config.conv_kernel_sizes
+
+        self.conv_layers = nn.ModuleList(
+            nn.nn.Conv1d(
+                self.in_channels if i == 0 else self.mid_channels // 2,
+                self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
+                kernel_size=k,
+                stride=2,
+                padding=k // 2,
+            )
+            for i, k in enumerate(self.kernel_sizes)
+        )
+
+    def forward(self, input_features):
+        hidden_states = input_features.transpose(1, 2).contiguous()  # -> B x (C x D) x T
+        for conv in self.conv_layers:
+            hidden_states = conv(hidden_states)
+            hidden_states = nn.functional.glu(hidden_states, dim=1)
+        hidden_states = hidden_states.transpose(1, 2).contiguous()  # -> T x B x (C x D)
+        return hidden_states
+
+
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextSinusoidalPositionalEmbedding with Speech2Text->Whisper
+class WhisperSinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.weights = nn.Parameter(emb_weights)
+        self.weights.requires_grad = False
+        self.weights.detach_()
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
+        description in Section 3.5 of "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+        return emb
+
+    @torch.no_grad()
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        bsz, seq_len = input_ids.size()
+        # Create the position ids from the input token ids. Any padded tokens remain padded.
+        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+            input_ids.device
+        )
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
+
+    def create_position_ids_from_input_ids(
+        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
+    ):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            x: torch.Tensor x:
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
+
+
+class WhisperPositionalEmbedding(nn.Embedding):
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__(num_positions, embedding_dim)
+
+    def forward(self, input_ids, past_key_values_length=0):
+
+        return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]]
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Whisper
+class WhisperAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class WhisperResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
+        super().__init__()
+
+        self.attn = WhisperAttention(n_state, n_head)
+        self.attn_ln = nn.LayerNorm(n_state)
+
+        self.cross_attn = WhisperAttention(n_state, n_head) if cross_attention else None
+        self.cross_attn_ln = nn.LayerNorm(n_state) if cross_attention else None
+
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state))
+        self.mlp_ln = nn.LayerNorm(n_state)
+
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
+        if self.cross_attn:
+            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+
+
+class WhisperEncoderLayer(nn.Module):
+    def __init__(self, config: WhisperConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.activation_fn = ACT2FN[config.activation_function]
+
+        self.fc1 = nn.Linear(self.embed_dim, 4 * self.embed_dim)
+        self.fc2 = nn.Linear(4 * self.embed_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class WhisperDecoderLayer(nn.Module):
+    def __init__(self, config: WhisperConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            is_decoder=True,
+        )
+        self.activation_fn = ACT2FN[config.activation_function]
+
+        self.self_attn_layer_norm = nn.nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = WhisperAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.nn.Linear(self.embed_dim, 4 * self.embed_dim)
+        self.fc2 = nn.nn.Linear(4 * self.embed_dim, self.embed_dim)
+        self.final_layer_norm = nn.nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size *(decoder_attention_heads,)*.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = residual + hidden_states
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.fc2(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextPreTrainedModel with Speech2Text->Whisper
+class WhisperPreTrainedModel(PreTrainedModel):
+    config_class = WhisperConfig
+    base_model_prefix = "model"
+    main_input_name = "input_features"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, (nn.nn.Linear, nn.nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (WhisperDecoder, WhisperEncoder)):
+            module.gradient_checkpointing = value
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+        for i in range(self.config.num_conv_layers):
+            input_lengths = (input_lengths - 1) // 2 + 1
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
+        # generate creates 3D attention mask, because of the shape of input_features
+        # convert it to 2D if thats the case
+        if len(attention_mask.shape) > 2:
+            attention_mask = attention_mask[:, :, -1]
+
+        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+        bsz = attention_mask.size()[0]
+        attention_mask = torch.zeros(
+            (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+
+        # these two operations makes sure that all values
+        # before the output lengths indices are attended to
+        attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
+        return attention_mask
+
+
+WHISPER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`WhisperConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+WHISPER_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
+            Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
+            by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
+            via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
+            tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+            1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`SpeechToTextTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            SpeechToText uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read
+            [`modeling_whisper._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the
+            paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`. decoder_inputs_embeds (`torch.FloatTensor` of
+            shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
+            `decoder_input_ids` you can choose to directly pass an embedded representation. If `past_key_values` is
+            used, optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). This is
+            useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+
+            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+            of `inputs_embeds`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextEncoder with Speech2Text->Whisper
+class WhisperEncoder(WhisperPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`WhisperEncoderLayer`].
+
+    Args:
+        config: WhisperConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        self.conv1 = nn.Conv1d(self.n_mels, embed_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
+
+        self.embed_positions = WhisperSinusoidalPositionalEmbedding(
+            self.max_source_positions,
+            embed_dim,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
+                Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
+                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the fbank features,
+                padding and conversion into a tensor of type `torch.FloatTensor`. See
+                [`~WhisperFeatureExtractor.__call__`]
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
+                `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        inputs_embeds = F.gelu(self.conv1(input_features))
+        inputs_embeds = F.gelu(self.conv2(input_features))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+
+        # subsample attention mask if necessary
+        if attention_mask is not None:
+            attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
+            padding_mask = attention_mask.ne(1).long()
+        else:
+            padding_mask = torch.zeros(inputs_embeds.shape[:2], dtype=torch.long, device=inputs_embeds.device)
+
+        embed_pos = self.embed_positions(padding_mask)
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )  # TODO should we remove all dropout?
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class WhisperDecoder(WhisperPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`WhisperDecoderLayer`]
+
+    Args:
+        config: WhisperConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_target_positions
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        self.embed_tokens = nn.Linear(config.vocab_size, config.d_model, self.padding_idx, bias=False)
+
+        self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model)
+
+        self.layers = nn.ModuleList([WhisperDecoderLayer(config) for _ in range(config.decoder_layers)])
+
+        self.layer_norm = nn.nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
+            ).to(inputs_embeds.device)
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+
+        return combined_attention_mask
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
+                on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
+                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
+                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
+                control over how to convert `input_ids` indices into associated vectors than the model's internal
+                embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+
+        # embed positions
+        positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
+
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=self.training
+        )  # TODO should we remove all of em?
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                assert attn_mask.size()[0] == (len(self.layers)), (
+                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache ="
+                        " False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,
+                )
+            else:
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare Whisper Model outputting raw hidden-states without any specific head on top.",
+    WHISPER_START_DOCSTRING,
+)
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextModel with Speech2Text->Whisper,SPEECH_TO_TEXT->WHISPER,facebook/s2t-small-librispeech-asr->openai/whisper-base
+class WhisperModel(WhisperPreTrainedModel):
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+
+        self.encoder = WhisperEncoder(config)
+        self.decoder = WhisperDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.decoder.embed_tokens = value
+
+    def set_output_embeddings(self, new_embeddings):
+        self.decoder.embed_tokens = new_embeddings
+
+    def get_output_embeddings(self):
+        return self.decoder.embed_tokens
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_decoder(self):
+        return self.decoder
+
+    @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_features=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        decoder_inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+         ```python
+         >>> import torch
+         >>> from transformers import WhisperModel, WhisperFeatureExtractor
+         >>> from datasets import load_dataset
+
+         >>> model = WhisperModel.from_pretrained("openai/whisper-base")
+         >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
+         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+         >>> inputs = feature_extractor(
+         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
+         ... )
+         >>> input_features = inputs.input_features
+         >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
+         >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
+         >>> list(last_hidden_state.shape)
+         [1, 2, 256]
+         ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_features,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # downsample encoder attention mask
+        if attention_mask is not None:
+            encoder_attention_mask = self._get_feature_vector_attention_mask(
+                encoder_outputs[0].shape[1], attention_mask
+            )
+        else:
+            encoder_attention_mask = None
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        decoder_outputs = self.decoder.embed_tokens(decoder_outputs)
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The Whisper Model with a language modeling head. Can be used for summarization.",
+    WHISPER_START_DOCSTRING,
+)
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextForConditionalGeneration with Speech2Text->Whisper,SPEECH_TO_TEXT->WHISPER,facebook/s2t-small-librispeech-asr->openai/whisper-base
+class WhisperForConditionalGeneration(WhisperPreTrainedModel):
+    base_model_prefix = "model"
+    _keys_to_ignore_on_load_missing = [
+        r"encoder.version",
+        r"decoder.version",
+        r"model.encoder.embed_positions.weights",
+        r"model.decoder.embed_positions.weights",
+    ]
+    _keys_to_ignore_on_save = [
+        r"model.encoder.embed_positions.weights",
+        r"model.decoder.embed_positions.weights",
+    ]
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.model = WhisperModel(config)
+        self.lm_head = nn.nn.Linear(config.d_model, self.config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens)
+        return new_embeddings
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_features=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
+            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
+            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import WhisperProcessor, WhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+
+        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-base")
+
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+        >>> inputs = processor(
+        ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
+        ... )
+        >>> input_features = inputs.input_features
+
+        >>> generated_ids = model.generate(inputs=input_features)
+
+        >>> transcription = processor.batch_decode(generated_ids)[0]
+        >>> transcription
+        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if decoder_input_ids is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.model(
+            input_features,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        lm_logits = self.lm_head(outputs[0])
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past=None,
+        attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs
+    ):
+        # cut decoder_input_ids if past is used
+        if past is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
+        }
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        reordered_past = ()
+        for layer_past in past:
+            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+        return reordered_past

From 00cdcbe99cab8d63cd26d464d663785cc4e5ac2d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 09:58:53 +0000
Subject: [PATCH 004/156] start conversion

---
 .../whisper/convert_openai_whisper_to_tfms.py | 149 ++++++++++++++++++
 .../models/whisper/modeling_whisper.py        |  39 ++---
 2 files changed, 169 insertions(+), 19 deletions(-)
 create mode 100644 src/transformers/models/whisper/convert_openai_whisper_to_tfms.py

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
new file mode 100644
index 0000000000000..8de6af914c235
--- /dev/null
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -0,0 +1,149 @@
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import torch
+from torch import nn
+
+from transformers import WhisperConfig, WhisperForConditionalGeneration
+
+
+def remove_ignore_keys_(state_dict):
+    ignore_keys = [
+        "encoder.version",
+        "decoder.version",
+        "model.encoder.version",
+        "model.decoder.version",
+        "decoder.output_projection.weight",
+        "_float_tensor",
+        "encoder.embed_positions._float_tensor",
+        "decoder.embed_positions._float_tensor",
+    ]
+    for k in ignore_keys:
+        state_dict.pop(k, None)
+
+
+def rename_keys(s_dict):
+    keys = list(s_dict.keys())
+    for key in keys:
+        if "blocks" in key:
+            s_dict[key.replace("blocks", "layers")] = s_dict.pop(key)
+            s_dict[key.replace("attn.query", "self_attn.q_proj")] = s_dict.pop(key)
+            s_dict[key.replace("attn.key", "self_k.q_proj")] = s_dict.pop(key)
+            s_dict[key.replace("attn.value", "self_attn.v_proj")] = s_dict.pop(key)
+
+            "mlp.0":"fc1"
+            "mlp:2":"fc2"
+            "attn_ln":"self_attn_layer_norm"
+            "attn.out":"self_attn.out_proj"
+            "mlp_ln":"final_layer_norm"
+
+
+        elif "subsample" in key:
+            s_dict[key.replace("subsample", "conv")] = s_dict.pop(key)
+
+
+def make_linear_from_emb(emb):
+    vocab_size, emb_size = emb.weight.shape
+    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
+    lin_layer.weight.data = emb.weight.data
+    return lin_layer
+
+
+def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
+    m2m_100 = torch.load(checkpoint_path, map_location="cpu")
+    args = m2m_100["args"]
+    state_dict = m2m_100["model"]
+    lm_head_weights = state_dict["decoder.output_projection.weight"]
+
+    remove_ignore_keys_(state_dict)
+    rename_keys(state_dict)
+
+    vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
+
+    tie_embeds = args.share_decoder_input_output_embed
+
+    conv_kernel_sizes = [int(i) for i in args.conv_kernel_sizes.split(",")]
+    config = WhisperConfig(
+        vocab_size=vocab_size,
+        max_source_positions=args.max_source_positions,
+        max_target_positions=args.max_target_positions,
+        encoder_layers=args.encoder_layers,
+        decoder_layers=args.decoder_layers,
+        encoder_attention_heads=args.encoder_attention_heads,
+        decoder_attention_heads=args.decoder_attention_heads,
+        encoder_ffn_dim=args.encoder_ffn_embed_dim,
+        decoder_ffn_dim=args.decoder_ffn_embed_dim,
+        d_model=args.encoder_embed_dim,
+        dropout=args.dropout,
+        attention_dropout=args.attention_dropout,
+        activation_dropout=args.activation_dropout,
+        activation_function="relu",
+        num_conv_layers=len(conv_kernel_sizes),
+        conv_channels=args.conv_channels,
+        conv_kernel_sizes=conv_kernel_sizes,
+        input_feat_per_channel=args.input_feat_per_channel,
+        input_channels=args.input_channels,
+        tie_word_embeddings=tie_embeds,
+        num_beams=5,
+        max_length=200,
+        use_cache=True,
+        decoder_start_token_id=2,
+        early_stopping=True,
+    )
+
+    model = WhisperForConditionalGeneration(config)
+    missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
+    if len(missing) > 0 and not set(missing) <= set(
+        [
+            "encoder.embed_positions.weights",
+            "decoder.embed_positions.weights",
+        ]
+    ):
+        raise ValueError(
+            "Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights`  are allowed to be missing,"
+            f" but all the following weights are missing {missing}"
+        )
+
+    if tie_embeds:
+        model.lm_head = make_linear_from_emb(model.model.decoder.embed_tokens)
+    else:
+        model.lm_head.weight.data = lm_head_weights
+
+    model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
+    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    args = parser.parse_args()
+
+    tiny_config = WhisperConfig(
+        vocab_size = 51865,
+        encoder_layers = 4, 
+        encoder_attention_heads = 6,
+        decoder_attention_heads = 6,
+        decoder_layers = 4,
+        d_model = 384, 
+    )
+
+
+
+
+
+
+    convert_fairseq_s2t_checkpoint_to_tfms(args.fairseq_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 8f1fc405dcabe..b5b4035bae96c 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -111,7 +111,7 @@ def __init__(self, config):
         self.kernel_sizes = config.conv_kernel_sizes
 
         self.conv_layers = nn.ModuleList(
-            nn.nn.Conv1d(
+            nn.Conv1d(
                 self.in_channels if i == 0 else self.mid_channels // 2,
                 self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
                 kernel_size=k,
@@ -236,10 +236,10 @@ def __init__(
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
 
-        self.k_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -464,16 +464,16 @@ def __init__(self, config: WhisperConfig):
         )
         self.activation_fn = ACT2FN[config.activation_function]
 
-        self.self_attn_layer_norm = nn.nn.LayerNorm(self.embed_dim)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
         self.encoder_attn = WhisperAttention(
             self.embed_dim,
             config.decoder_attention_heads,
             is_decoder=True,
         )
-        self.encoder_attn_layer_norm = nn.nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.nn.Linear(self.embed_dim, 4 * self.embed_dim)
-        self.fc2 = nn.nn.Linear(4 * self.embed_dim, self.embed_dim)
-        self.final_layer_norm = nn.nn.LayerNorm(self.embed_dim)
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, 4 * self.embed_dim)
+        self.fc2 = nn.Linear(4 * self.embed_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
     def forward(
         self,
@@ -571,7 +571,7 @@ class WhisperPreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         std = self.config.init_std
-        if isinstance(module, (nn.nn.Linear, nn.nn.Conv1d)):
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
                 module.bias.data.zero_()
@@ -732,11 +732,12 @@ def __init__(self, config: WhisperConfig):
         super().__init__(config)
 
         embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
         self.padding_idx = config.pad_token_id
         self.max_source_positions = config.max_source_positions
         self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
 
-        self.conv1 = nn.Conv1d(self.n_mels, embed_dim, kernel_size=3, padding=1)
+        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
         self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
 
         self.embed_positions = WhisperSinusoidalPositionalEmbedding(
@@ -745,7 +746,7 @@ def __init__(self, config: WhisperConfig):
             self.padding_idx,
         )
         self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layer_norm = nn.nn.LayerNorm(config.d_model)
+        self.layer_norm = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -893,13 +894,13 @@ def __init__(self, config: WhisperConfig):
         self.max_target_positions = config.max_target_positions
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
-        self.embed_tokens = nn.Linear(config.vocab_size, config.d_model, self.padding_idx, bias=False)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
 
         self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model)
 
         self.layers = nn.ModuleList([WhisperDecoderLayer(config) for _ in range(config.decoder_layers)])
 
-        self.layer_norm = nn.nn.LayerNorm(config.d_model)
+        self.layer_norm = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1157,7 +1158,7 @@ def __init__(self, config: WhisperConfig):
 
         self.encoder = WhisperEncoder(config)
         self.decoder = WhisperDecoder(config)
-
+        self.proj_out = nn.Linear(config.d_model, config.vocab_size,bias=False)
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1168,10 +1169,10 @@ def set_input_embeddings(self, value):
         self.decoder.embed_tokens = value
 
     def set_output_embeddings(self, new_embeddings):
-        self.decoder.embed_tokens = new_embeddings
+        self.proj_out = new_embeddings
 
     def get_output_embeddings(self):
-        return self.decoder.embed_tokens
+        return self.proj_out
 
     def get_encoder(self):
         return self.encoder
@@ -1306,7 +1307,7 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
     def __init__(self, config: WhisperConfig):
         super().__init__(config)
         self.model = WhisperModel(config)
-        self.lm_head = nn.nn.Linear(config.d_model, self.config.vocab_size, bias=False)
+        self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
         self.post_init()

From a916bf125585866d58578a00a5fd366eb3cdc6b7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 11:27:59 +0000
Subject: [PATCH 005/156] add dropout

---
 .../models/whisper/configuration_whisper.py   | 171 ++++++++++++++++++
 .../whisper/convert_openai_whisper_to_tfms.py |  20 +-
 .../models/whisper/modeling_whisper.py        |  61 ++-----
 3 files changed, 204 insertions(+), 48 deletions(-)
 create mode 100644 src/transformers/models/whisper/configuration_whisper.py

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
new file mode 100644
index 0000000000000..296f307cf1c09
--- /dev/null
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Whisper model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/config.json",
+}
+
+
+class WhisperConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate an
+    Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Whisper
+    [openai/whisper-base](https://huggingface.co/openai/whisper-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`WhisperModel`]
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        max_source_positions (`int`, *optional*, defaults to 6000):
+            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
+        max_target_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        num_conv_layers (`int`, *optional*, defaults to 2):
+            Number of 1D convolutional layers in the conv module.
+        conv_kernel_sizes (`Tuple[int]`, *optional*, defaults to `(5, 5)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The length
+            of `conv_kernel_sizes` has to match `num_conv_layers`.
+        conv_channels (`int`, *optional*, defaults to 1024):
+            An integer defining the number of output channels of each convolution layers except the final one in the
+            conv module.
+        input_feat_per_channel (`int`, *optional*, defaults to 80):
+            An integer specifying the size of feature vector. This is also the dimensions of log-mel filter-bank
+            features.
+        input_channels (`int`, *optional*, defaults to 1):
+            An integer specifying number of input channels of the input feature vector.
+
+    Example:
+
+    ```python
+    >>> from transformers import WhisperModel, WhisperConfig
+
+    >>> # Initializing a Whisper s2t_transformer_s style configuration
+    >>> configuration = WhisperConfig()
+
+    >>> # Initializing a model from the s2t_transformer_s style configuration
+    >>> model = WhisperModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "whisper"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+    def __init__(
+        self,
+        vocab_size=10000,
+        num_mel_bins=80,
+        encoder_layers=12,
+        encoder_attention_heads=4,
+        decoder_layers=6,
+        decoder_attention_heads=4,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=256,
+        dropout=0.0,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        scale_embedding=False,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        input_feat_per_channel=80,
+        input_channels=1,
+        tie_word_embeddings=True,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.tie_word_embeddings = tie_word_embeddings
+        self.input_feat_per_channel = input_feat_per_channel
+        self.input_channels = input_channels
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 8de6af914c235..4176ed3775837 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -35,6 +35,20 @@ def remove_ignore_keys_(state_dict):
         state_dict.pop(k, None)
 
 
+WHISPER_MAPPING = {
+    "blocks" : "layers",
+    "mlp.0":"fc1",
+    "mlp:2":"fc2",
+    "attn_ln":"self_attn_layer_norm",
+    "attn.out":"self_attn.out_proj",
+    "mlp_ln":"final_layer_norm",
+    "blocks":"layers",
+    "attn.query":"self_attn.q_proj",
+    "attn.key":"self_attn.k_proj",
+    "attn.value":"self_attn.v_proj"
+
+}
+
 def rename_keys(s_dict):
     keys = list(s_dict.keys())
     for key in keys:
@@ -44,11 +58,7 @@ def rename_keys(s_dict):
             s_dict[key.replace("attn.key", "self_k.q_proj")] = s_dict.pop(key)
             s_dict[key.replace("attn.value", "self_attn.v_proj")] = s_dict.pop(key)
 
-            "mlp.0":"fc1"
-            "mlp:2":"fc2"
-            "attn_ln":"self_attn_layer_norm"
-            "attn.out":"self_attn.out_proj"
-            "mlp_ln":"final_layer_norm"
+
 
 
         elif "subsample" in key:
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index b5b4035bae96c..d737b1498b728 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -357,35 +357,7 @@ def forward(
 
         return attn_output, attn_weights_reshaped, past_key_value
 
-
-class WhisperResidualAttentionBlock(nn.Module):
-    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
-        super().__init__()
-
-        self.attn = WhisperAttention(n_state, n_head)
-        self.attn_ln = nn.LayerNorm(n_state)
-
-        self.cross_attn = WhisperAttention(n_state, n_head) if cross_attention else None
-        self.cross_attn_ln = nn.LayerNorm(n_state) if cross_attention else None
-
-        n_mlp = n_state * 4
-        self.mlp = nn.Sequential(nn.Linear(n_state, n_mlp), nn.GELU(), nn.Linear(n_mlp, n_state))
-        self.mlp_ln = nn.LayerNorm(n_state)
-
-    def forward(
-        self,
-        x: Tensor,
-        xa: Optional[Tensor] = None,
-        mask: Optional[Tensor] = None,
-        kv_cache: Optional[dict] = None,
-    ):
-        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)
-        if self.cross_attn:
-            x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)
-        x = x + self.mlp(self.mlp_ln(x))
-        return x
-
-
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextEncoderLayer with Speech2Text->Whisper
 class WhisperEncoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
         super().__init__()
@@ -396,8 +368,9 @@ def __init__(self, config: WhisperConfig):
             dropout=config.attention_dropout,
         )
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
-
+        self.activation_dropout = config.activation_dropout
         self.fc1 = nn.Linear(self.embed_dim, 4 * self.embed_dim)
         self.fc2 = nn.Linear(4 * self.embed_dim, self.embed_dim)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
@@ -428,12 +401,14 @@ def forward(
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
 
         residual = hidden_states
-
         hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
 
         hidden_states = residual + hidden_states
@@ -451,7 +426,7 @@ def forward(
 
         return outputs
 
-
+# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextDecoderLayer with Speech2Text->Whisper
 class WhisperDecoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
         super().__init__()
@@ -460,14 +435,18 @@ def __init__(self, config: WhisperConfig):
         self.self_attn = WhisperAttention(
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
             is_decoder=True,
         )
+        self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
         self.encoder_attn = WhisperAttention(
             self.embed_dim,
             config.decoder_attention_heads,
+            dropout=config.attention_dropout,
             is_decoder=True,
         )
         self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
@@ -519,6 +498,7 @@ def forward(
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
 
         # Cross-Attention Block
@@ -537,6 +517,7 @@ def forward(
                 past_key_value=cross_attn_past_key_value,
                 output_attentions=output_attentions,
             )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
             hidden_states = residual + hidden_states
 
             # add cross-attn to positions 3,4 of present_key_value tuple
@@ -544,11 +525,11 @@ def forward(
 
         # Fully Connected
         residual = hidden_states
-
         hidden_states = self.final_layer_norm(hidden_states)
         hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
 
         outputs = (hidden_states,)
@@ -717,7 +698,6 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
 """
 
 
-# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextEncoder with Speech2Text->Whisper
 class WhisperEncoder(WhisperPreTrainedModel):
     """
     Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
@@ -813,9 +793,7 @@ def forward(
         embed_pos = self.embed_positions(padding_mask)
 
         hidden_states = inputs_embeds + embed_pos
-        hidden_states = nn.functional.dropout(
-            hidden_states, p=self.dropout, training=self.training
-        )  # TODO should we remove all dropout?
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # expand attention_mask
         if attention_mask is not None:
@@ -895,7 +873,6 @@ def __init__(self, config: WhisperConfig):
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
-
         self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model)
 
         self.layers = nn.ModuleList([WhisperDecoderLayer(config) for _ in range(config.decoder_layers)])
@@ -1047,9 +1024,7 @@ def forward(
         positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
 
         hidden_states = inputs_embeds + positions
-        hidden_states = nn.functional.dropout(
-            hidden_states, p=self.dropout, training=self.training
-        )  # TODO should we remove all of em?
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
@@ -1269,7 +1244,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        decoder_outputs = self.decoder.embed_tokens(decoder_outputs)
+        decoder_outputs = self.proj_out(decoder_outputs)
 
         if not return_dict:
             return decoder_outputs + encoder_outputs

From 7ebda7df2c516bdddcba239ee725aa47ae3cfa04 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 11:28:34 +0000
Subject: [PATCH 006/156] initial commit of test files

---
 docs/source/en/model_doc/whisper.mdx          |  66 ++
 src/transformers/__init__.py                  |  25 +
 src/transformers/models/__init__.py           |   1 +
 .../models/auto/configuration_auto.py         |   3 +
 .../models/auto/feature_extraction_auto.py    |   1 +
 src/transformers/models/auto/modeling_auto.py |   3 +
 .../models/auto/processing_auto.py            |   1 +
 .../models/auto/tokenization_auto.py          |   1 +
 tests/models/whisper/__init__.py              |   0
 .../test_feature_extraction_whisper.py        | 250 ++++++
 tests/models/whisper/test_modeling_whisper.py | 780 ++++++++++++++++++
 .../models/whisper/test_processor_whisper.py  | 144 ++++
 .../whisper/test_tokenization_whisper.py      | 163 ++++
 13 files changed, 1438 insertions(+)
 create mode 100644 docs/source/en/model_doc/whisper.mdx
 create mode 100644 tests/models/whisper/__init__.py
 create mode 100644 tests/models/whisper/test_feature_extraction_whisper.py
 create mode 100644 tests/models/whisper/test_modeling_whisper.py
 create mode 100644 tests/models/whisper/test_processor_whisper.py
 create mode 100644 tests/models/whisper/test_tokenization_whisper.py

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
new file mode 100644
index 0000000000000..a5e02c5f1a81b
--- /dev/null
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -0,0 +1,66 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Whisper
+
+## Overview
+
+The Whisper model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+<INSERT SHORT SUMMARY HERE>
+
+The abstract from the paper is the following:
+
+*<INSERT PAPER ABSTRACT HERE>*
+
+Tips:
+
+<INSERT TIPS ABOUT MODEL HERE>
+
+This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
+The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+
+
+## WhisperConfig
+
+[[autodoc]] WhisperConfig
+
+## WhisperTokenizer
+
+[[autodoc]] WhisperTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## WhisperFeatureExtractor
+
+[[autodoc]] WhisperFeatureExtractor
+    - __call__
+
+## WhisperProcessor
+
+[[autodoc]] WhisperProcessor
+    - __call__
+    - from_pretrained
+    - save_pretrained
+    - batch_decode
+    - decode
+
+## WhisperModel
+
+[[autodoc]] WhisperModel
+    - forward
+
+## WhisperForConditionalGeneration
+
+[[autodoc]] WhisperForConditionalGeneration
+    - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 3c3a3a5006416..3c9de9e2f8a04 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -376,6 +376,10 @@
         "WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "WavLMConfig",
     ],
+    "models.whisper": [
+        "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "WhisperConfig",
+    ],
     "models.x_clip": [
         "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "XCLIPConfig",
@@ -517,6 +521,7 @@
     _import_structure["models.reformer"].append("ReformerTokenizer")
     _import_structure["models.rembert"].append("RemBertTokenizer")
     _import_structure["models.speech_to_text"].append("Speech2TextTokenizer")
+    _import_structure["models.whisper"].append("WhisperTokenizer")
     _import_structure["models.t5"].append("T5Tokenizer")
     _import_structure["models.xglm"].append("XGLMTokenizer")
     _import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer")
@@ -617,6 +622,7 @@
 else:
     _import_structure["models.mctct"].append("MCTCTFeatureExtractor")
     _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
+    _import_structure["models.whisper"].append("WhisperFeatureExtractor")
 
 # Tensorflow-text-specific objects
 try:
@@ -642,6 +648,7 @@
     ]
 else:
     _import_structure["models.speech_to_text"].append("Speech2TextProcessor")
+    _import_structure["models.whisper"].append("WhisperProcessor")
 
 # Vision-specific objects
 try:
@@ -1805,6 +1812,14 @@
             "Speech2TextPreTrainedModel",
         ]
     )
+    _import_structure["models.whisper"].extend(
+        [
+            "WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "WhisperForConditionalGeneration",
+            "WhisperModel",
+            "WhisperPreTrainedModel",
+        ]
+    )
     _import_structure["models.speech_to_text_2"].extend(["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"])
     _import_structure["models.splinter"].extend(
         [
@@ -3250,6 +3265,7 @@
     from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
     from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
     from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig
+    from .models.whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig
     from .models.x_clip import (
         XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
         XCLIPConfig,
@@ -3390,6 +3406,7 @@
         from .models.rembert import RemBertTokenizer
         from .models.speech_to_text import Speech2TextTokenizer
         from .models.t5 import T5Tokenizer
+        from .models.whisper import WhisperTokenizer
         from .models.xglm import XGLMTokenizer
         from .models.xlm_prophetnet import XLMProphetNetTokenizer
         from .models.xlm_roberta import XLMRobertaTokenizer
@@ -3472,6 +3489,7 @@
     else:
         from .models.mctct import MCTCTFeatureExtractor
         from .models.speech_to_text import Speech2TextFeatureExtractor
+        from .models.whisper import WhisperFeatureExtractor
 
     try:
         if not is_tensorflow_text_available():
@@ -3488,6 +3506,7 @@
         from .utils.dummy_sentencepiece_and_speech_objects import *
     else:
         from .models.speech_to_text import Speech2TextProcessor
+        from .models.whisper import WhisperProcessor
 
     try:
         if not is_vision_available():
@@ -4594,6 +4613,12 @@
             WavLMModel,
             WavLMPreTrainedModel,
         )
+        from .models.whisper import (
+            WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            WhisperForConditionalGeneration,
+            WhisperModel,
+            WhisperPreTrainedModel,
+        )
         from .models.x_clip import (
             XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
             XCLIPModel,
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index fbdbfd579cb9e..4454f2e837afc 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -154,6 +154,7 @@
     wav2vec2_phoneme,
     wav2vec2_with_lm,
     wavlm,
+    whisper,
     x_clip,
     xglm,
     xlm,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 1204e6608a768..d5db9755b2a96 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -147,6 +147,7 @@
         ("wav2vec2", "Wav2Vec2Config"),
         ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"),
         ("wavlm", "WavLMConfig"),
+        ("whisper", "WhisperConfig"),
         ("xclip", "XCLIPConfig"),
         ("xglm", "XGLMConfig"),
         ("xlm", "XLMConfig"),
@@ -266,6 +267,7 @@
         ("vit_mae", "VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("wav2vec2-conformer", "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("whisper", "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("xclip", "X_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("xglm", "XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -419,6 +421,7 @@
         ("wav2vec2-conformer", "Wav2Vec2-Conformer"),
         ("wav2vec2_phoneme", "Wav2Vec2Phoneme"),
         ("wavlm", "WavLM"),
+        ("whisper", "Whisper"),
         ("xclip", "X-CLIP"),
         ("xglm", "XGLM"),
         ("xlm", "XLM"),
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 015fd132ef0dc..aa1b25a0b8fad 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -76,6 +76,7 @@
         ("vit_mae", "ViTFeatureExtractor"),
         ("wav2vec2", "Wav2Vec2FeatureExtractor"),
         ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"),
+        ("whisper", "WhisperFeatureExtractor"),
         ("xclip", "CLIPFeatureExtractor"),
         ("yolos", "YolosFeatureExtractor"),
     ]
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 7f4968d03cdf6..a0cf200260fee 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -141,6 +141,7 @@
         ("wav2vec2", "Wav2Vec2Model"),
         ("wav2vec2-conformer", "Wav2Vec2ConformerModel"),
         ("wavlm", "WavLMModel"),
+        ("whisper", "WhisperModel"),
         ("xclip", "XCLIPModel"),
         ("xglm", "XGLMModel"),
         ("xlm", "XLMModel"),
@@ -266,6 +267,7 @@
         ("tapas", "TapasForMaskedLM"),
         ("transfo-xl", "TransfoXLLMHeadModel"),
         ("wav2vec2", "Wav2Vec2ForMaskedLM"),
+        ("whisper", "WhisperForConditionalGeneration"),
         ("xlm", "XLMWithLMHeadModel"),
         ("xlm-roberta", "XLMRobertaForMaskedLM"),
         ("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"),
@@ -491,6 +493,7 @@
     [
         ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
         ("speech_to_text", "Speech2TextForConditionalGeneration"),
+        ("whisper", "WhisperForConditionalGeneration"),
     ]
 )
 
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 07b2811a16481..1313415b0fc73 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -60,6 +60,7 @@
         ("wav2vec2-conformer", "Wav2Vec2Processor"),
         ("wav2vec2_with_lm", "Wav2Vec2ProcessorWithLM"),
         ("wavlm", "Wav2Vec2Processor"),
+        ("whisper", "WhisperProcessor"),
         ("xclip", "CLIPProcessor"),
     ]
 )
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 97e048885e180..43fb6ce352a3f 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -255,6 +255,7 @@
             ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
             ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
             ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
+            ("whisper", ("WhisperTokenizer" if is_sentencepiece_available() else None, None)),
             ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
             (
                 "xglm",
diff --git a/tests/models/whisper/__init__.py b/tests/models/whisper/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
new file mode 100644
index 0000000000000..1c16b348adadb
--- /dev/null
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -0,0 +1,250 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import itertools
+import random
+import unittest
+
+import numpy as np
+
+from transformers import is_speech_available
+from transformers.testing_utils import require_torch, require_torchaudio
+
+from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
+
+
+if is_speech_available():
+    from transformers import WhisperFeatureExtractor
+
+global_rng = random.Random()
+
+
+def floats_list(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor"""
+    if rng is None:
+        rng = global_rng
+
+    values = []
+    for batch_idx in range(shape[0]):
+        values.append([])
+        for _ in range(shape[1]):
+            values[-1].append(rng.random() * scale)
+
+    return values
+
+
+@require_torch
+@require_torchaudio
+class WhisperFeatureExtractionTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        min_seq_length=400,
+        max_seq_length=2000,
+        feature_size=24,
+        num_mel_bins=24,
+        padding_value=0.0,
+        sampling_rate=16_000,
+        return_attention_mask=True,
+        do_normalize=True,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.min_seq_length = min_seq_length
+        self.max_seq_length = max_seq_length
+        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
+        self.feature_size = feature_size
+        self.num_mel_bins = num_mel_bins
+        self.padding_value = padding_value
+        self.sampling_rate = sampling_rate
+        self.return_attention_mask = return_attention_mask
+        self.do_normalize = do_normalize
+
+    def prepare_feat_extract_dict(self):
+        return {
+            "feature_size": self.feature_size,
+            "num_mel_bins": self.num_mel_bins,
+            "padding_value": self.padding_value,
+            "sampling_rate": self.sampling_rate,
+            "return_attention_mask": self.return_attention_mask,
+            "do_normalize": self.do_normalize,
+        }
+
+    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
+        def _flatten(list_of_lists):
+            return list(itertools.chain(*list_of_lists))
+
+        if equal_length:
+            speech_inputs = [floats_list((self.max_seq_length, self.feature_size)) for _ in range(self.batch_size)]
+        else:
+            # make sure that inputs increase in size
+            speech_inputs = [
+                floats_list((x, self.feature_size))
+                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
+            ]
+        if numpify:
+            speech_inputs = [np.asarray(x) for x in speech_inputs]
+        return speech_inputs
+
+
+@require_torch
+@require_torchaudio
+class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
+
+    feature_extraction_class = WhisperFeatureExtractor if is_speech_available() else None
+
+    def setUp(self):
+        self.feat_extract_tester = WhisperFeatureExtractionTester(self)
+
+    def _check_zero_mean_unit_variance(self, input_vector):
+        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
+        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
+
+    def test_call(self):
+        # Tests that all call wrap to encode_plus and batch_encode_plus
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        # Test feature size
+        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
+        self.assertTrue(input_features.ndim == 3)
+        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
+
+        # Test not batched input
+        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
+        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
+
+        # Test batched
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+    def test_cepstral_mean_and_variance_normalization(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        paddings = ["longest", "max_length", "do_not_pad"]
+        max_lengths = [None, 16, None]
+        for max_length, padding in zip(max_lengths, paddings):
+            inputs = feature_extractor(
+                speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True
+            )
+            input_features = inputs.input_features
+            attention_mask = inputs.attention_mask
+            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
+
+            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
+            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
+            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
+
+    def test_cepstral_mean_and_variance_normalization_np(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        paddings = ["longest", "max_length", "do_not_pad"]
+        max_lengths = [None, 16, None]
+        for max_length, padding in zip(max_lengths, paddings):
+            inputs = feature_extractor(
+                speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True
+            )
+            input_features = inputs.input_features
+            attention_mask = inputs.attention_mask
+            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
+
+            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
+            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
+            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
+            self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6)
+            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
+
+    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        inputs = feature_extractor(
+            speech_inputs,
+            padding="max_length",
+            max_length=4,
+            truncation=True,
+            return_tensors="np",
+            return_attention_mask=True,
+        )
+        input_features = inputs.input_features
+        attention_mask = inputs.attention_mask
+        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
+
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1])
+        self._check_zero_mean_unit_variance(input_features[2])
+
+    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        inputs = feature_extractor(
+            speech_inputs,
+            padding="longest",
+            max_length=4,
+            truncation=True,
+            return_tensors="np",
+            return_attention_mask=True,
+        )
+        input_features = inputs.input_features
+        attention_mask = inputs.attention_mask
+        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
+
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
+        self._check_zero_mean_unit_variance(input_features[2])
+
+        # make sure that if max_length < longest -> then pad to max_length
+        self.assertEqual(input_features.shape, (3, 4, 24))
+
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        inputs = feature_extractor(
+            speech_inputs,
+            padding="longest",
+            max_length=16,
+            truncation=True,
+            return_tensors="np",
+            return_attention_mask=True,
+        )
+        input_features = inputs.input_features
+        attention_mask = inputs.attention_mask
+        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
+
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
+        self._check_zero_mean_unit_variance(input_features[2])
+
+        # make sure that if max_length < longest -> then pad to max_length
+        self.assertEqual(input_features.shape, (3, 6, 24))
+
+    def test_double_precision_pad(self):
+        import torch
+
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
+        py_speech_inputs = np_speech_inputs.tolist()
+
+        for inputs in [py_speech_inputs, np_speech_inputs]:
+            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
+            self.assertTrue(np_processed.input_features.dtype == np.float32)
+            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
+            self.assertTrue(pt_processed.input_features.dtype == torch.float32)
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
new file mode 100644
index 0000000000000..bdf6377c98613
--- /dev/null
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -0,0 +1,780 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Whisper model. """
+
+import copy
+import inspect
+import os
+import tempfile
+import unittest
+
+from transformers import WhisperConfig
+from transformers.testing_utils import (
+    is_torch_available,
+    require_sentencepiece,
+    require_tokenizers,
+    require_torch,
+    require_torchaudio,
+    slow,
+    torch_device,
+)
+from transformers.utils import cached_property
+
+from ...generation.test_generation_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import WhisperForConditionalGeneration, WhisperModel, WhisperProcessor
+    from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
+
+
+def prepare_whisper_inputs_dict(
+    config,
+    input_features,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+    head_mask=None,
+    decoder_head_mask=None,
+    cross_attn_head_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = input_features.ne(0)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    if head_mask is None:
+        head_mask = torch.ones(config.encoder_layers, config.encoder_attention_heads, device=torch_device)
+    if decoder_head_mask is None:
+        decoder_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    if cross_attn_head_mask is None:
+        cross_attn_head_mask = torch.ones(config.decoder_layers, config.decoder_attention_heads, device=torch_device)
+    return {
+        # "input_ids": input_features,
+        "input_features": input_features,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": attention_mask,
+        "head_mask": head_mask,
+        "decoder_head_mask": decoder_head_mask,
+        "cross_attn_head_mask": cross_attn_head_mask,
+    }
+
+
+@require_torch
+class WhisperModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        num_conv_layers=2,
+        conv_kernel_sizes=(5, 5),
+        conv_channels=32,
+        input_feat_per_channel=24,
+        input_channels=1,
+        hidden_act="relu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        max_source_positions=20,
+        max_target_positions=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.num_conv_layers = num_conv_layers
+        self.conv_kernel_sizes = conv_kernel_sizes
+        self.conv_channels = conv_channels
+        self.input_feat_per_channel = input_feat_per_channel
+        self.input_channels = input_channels
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.max_source_positions = max_source_positions
+        self.max_target_positions = max_target_positions
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs(self):
+        input_features = floats_tensor(
+            [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size
+        )
+        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2)
+
+        config = self.get_config()
+        inputs_dict = prepare_whisper_inputs_dict(
+            config,
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            attention_mask=attention_mask,
+        )
+        return config, inputs_dict
+
+    def get_config(self):
+        return WhisperConfig(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            num_conv_layers=self.num_conv_layers,
+            conv_kernel_sizes=self.conv_kernel_sizes,
+            conv_channels=self.conv_channels,
+            input_feat_per_channel=self.input_feat_per_channel,
+            input_channels=self.input_channels,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            max_source_positions=self.max_source_positions,
+            max_target_positions=self.max_target_positions,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+        )
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def get_subsampled_output_lengths(self, input_lengths):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        for i in range(self.num_conv_layers):
+            input_lengths = (input_lengths - 1) // 2 + 1
+
+        return input_lengths
+
+    def create_and_check_model_forward(self, config, inputs_dict):
+        model = WhisperModel(config=config).to(torch_device).eval()
+
+        input_features = inputs_dict["input_features"]
+        decoder_input_ids = inputs_dict["decoder_input_ids"]
+
+        # first forward pass
+        last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
+
+        self.parent.assertTrue(last_hidden_state.shape, (13, 7, 16))
+
+    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = WhisperModel(config=config).get_decoder().to(torch_device).eval()
+        input_ids = inputs_dict["decoder_input_ids"]
+        attention_mask = inputs_dict["decoder_attention_mask"]
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size).clamp(2)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
+            "last_hidden_state"
+        ]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
+
+    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
+        model = WhisperModel(config=config).to(torch_device).eval()
+        outputs = model(**inputs_dict)
+
+        encoder_last_hidden_state = outputs.encoder_last_hidden_state
+        last_hidden_state = outputs.last_hidden_state
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            encoder = model.get_encoder()
+            encoder.save_pretrained(tmpdirname)
+            encoder = WhisperEncoder.from_pretrained(tmpdirname).to(torch_device)
+
+        encoder_last_hidden_state_2 = encoder(
+            inputs_dict["input_features"], attention_mask=inputs_dict["attention_mask"]
+        )[0]
+
+        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            decoder = model.get_decoder()
+            decoder.save_pretrained(tmpdirname)
+            decoder = WhisperDecoder.from_pretrained(tmpdirname).to(torch_device)
+
+        encoder_attention_mask = encoder._get_feature_vector_attention_mask(
+            encoder_last_hidden_state.shape[1], inputs_dict["attention_mask"]
+        )
+
+        last_hidden_state_2 = decoder(
+            input_ids=inputs_dict["decoder_input_ids"],
+            attention_mask=inputs_dict["decoder_attention_mask"],
+            encoder_hidden_states=encoder_last_hidden_state,
+            encoder_attention_mask=encoder_attention_mask,
+        )[0]
+
+        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
+
+
+@require_torch
+class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (WhisperModel, WhisperForConditionalGeneration) if is_torch_available() else ()
+    all_generative_model_classes = (WhisperForConditionalGeneration,) if is_torch_available() else ()
+    is_encoder_decoder = True
+    fx_compatible = False
+    test_pruning = False
+    test_missing_keys = False
+
+    input_name = "input_features"
+
+    def setUp(self):
+        self.model_tester = WhisperModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=WhisperConfig)
+        self.maxDiff = 3000
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_save_load_strict(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])
+
+    def test_model_forward(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model_forward(*config_and_inputs)
+
+    def test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_encoder_decoder_model_standalone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
+
+    # not implemented currently
+    def test_inputs_embeds(self):
+        pass
+
+    # training is not supported yet
+    def test_training(self):
+        pass
+
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    def test_generate_fp16(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs()
+        input_features = input_dict["input_features"]
+        attention_mask = input_dict["attention_mask"]
+        model = WhisperForConditionalGeneration(config).eval().to(torch_device)
+        if torch_device == "cuda":
+            input_features = input_features.half()
+            model.half()
+        model.generate(input_features, attention_mask=attention_mask)
+        model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = [
+                "input_features",
+                "attention_mask",
+                "decoder_input_ids",
+                "decoder_attention_mask",
+            ]
+            expected_arg_names.extend(
+                ["head_mask", "decoder_head_mask", "cross_attn_head_mask", "encoder_outputs"]
+                if "head_mask" and "decoder_head_mask" and "cross_attn_head_mask" in arg_names
+                else ["encoder_outputs"]
+            )
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_layers = getattr(
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+
+            if hasattr(self.model_tester, "encoder_seq_length"):
+                seq_length = self.model_tester.encoder_seq_length
+            else:
+                seq_length = self.model_tester.seq_length
+
+            subsampled_seq_length = model._get_feat_extract_output_lengths(seq_length)
+
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [subsampled_seq_length, self.model_tester.hidden_size],
+            )
+
+            if config.is_encoder_decoder:
+                hidden_states = outputs.decoder_hidden_states
+
+                self.assertIsInstance(hidden_states, (list, tuple))
+                self.assertEqual(len(hidden_states), expected_num_layers)
+                seq_len = getattr(self.model_tester, "seq_length", None)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [decoder_seq_length, self.model_tester.hidden_size],
+                )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    def test_attention_outputs(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(encoder_seq_length)
+            subsampled_encoder_key_length = model._get_feat_extract_output_lengths(encoder_key_length)
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            correct_outlen = 5
+
+            # loss is at first position
+            if "labels" in inputs_dict:
+                correct_outlen += 1  # loss is added to beginning
+            if "past_key_values" in outputs:
+                correct_outlen += 1  # past_key_values have been returned
+
+            self.assertEqual(out_len, correct_outlen)
+
+            # decoder attentions
+            decoder_attentions = outputs.decoder_attentions
+            self.assertIsInstance(decoder_attentions, (list, tuple))
+            self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(decoder_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+            )
+
+            # cross attentions
+            cross_attentions = outputs.cross_attentions
+            self.assertIsInstance(cross_attentions, (list, tuple))
+            self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(cross_attentions[0].shape[-3:]),
+                [
+                    self.model_tester.num_attention_heads,
+                    decoder_seq_length,
+                    subsampled_encoder_key_length,
+                ],
+            )
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            added_hidden_states = 2
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, subsampled_encoder_seq_length, subsampled_encoder_key_length],
+            )
+
+    def test_resize_tokens_embeddings(self):
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            return
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config)
+            model.to(torch_device)
+
+            if self.model_tester.is_training is False:
+                model.eval()
+
+            model_vocab_size = config.vocab_size
+            # Retrieve the embeddings and clone theme
+            model_embed = model.resize_token_embeddings(model_vocab_size)
+            cloned_embeddings = model_embed.weight.clone()
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+            # make sure that decoder_input_ids are resized
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            models_equal = True
+            for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+    def test_resize_embeddings_untied(self):
+        (
+            original_config,
+            inputs_dict,
+        ) = self.model_tester.prepare_config_and_inputs_for_common()
+        if not self.test_resize_embeddings:
+            return
+
+        original_config.tie_word_embeddings = False
+
+        # if model cannot untied embeddings -> leave test
+        if original_config.tie_word_embeddings:
+            return
+
+        for model_class in self.all_model_classes:
+            config = copy.deepcopy(original_config)
+            model = model_class(config).to(torch_device)
+
+            # if no output embeddings -> leave test
+            if model.get_output_embeddings() is None:
+                continue
+
+            # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            model_vocab_size = config.vocab_size
+            model.resize_token_embeddings(model_vocab_size + 10)
+            self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+            # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            model.resize_token_embeddings(model_vocab_size - 15)
+            self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            # Check that it actually resizes the embeddings matrix
+            output_embeds = model.get_output_embeddings()
+            self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15)
+            # Check bias if present
+            if output_embeds.bias is not None:
+                self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            if "decoder_input_ids" in inputs_dict:
+                inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**self._prepare_for_class(inputs_dict, model_class))
+
+    def test_generate_without_input_ids(self):
+        pass
+
+    @staticmethod
+    def _get_encoder_outputs(
+        model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
+    ):
+        encoder = model.get_encoder()
+        encoder_outputs = encoder(
+            input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave(
+            num_interleave, dim=0
+        )
+        input_ids = input_ids[:, :, 0]
+        input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + model._get_decoder_start_token_id()
+        attention_mask = None
+        return encoder_outputs, input_ids, attention_mask
+
+    def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
+        batch_size, seq_length = input_ids.shape[:2]
+        subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
+        num_sequences_in_output = batch_size * num_return_sequences
+        gen_len = (
+            output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length
+        )
+
+        # scores
+        self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config)
+
+        # Attentions
+        # encoder
+        self._check_encoder_attention_for_generate(
+            output.encoder_attentions, batch_size, config, subsampled_seq_length
+        )
+        # decoder
+        self._check_attentions_for_generate(
+            num_sequences_in_output,
+            output.decoder_attentions,
+            min_length=1,
+            max_length=output.sequences.shape[-1],
+            config=config,
+            use_cache=use_cache,
+        )
+
+        # Hidden States
+        # encoder
+        self._check_encoder_hidden_states_for_generate(
+            output.encoder_hidden_states, batch_size, config, subsampled_seq_length
+        )
+
+        # decoder
+        self._check_hidden_states_for_generate(
+            num_sequences_in_output,
+            output.decoder_hidden_states,
+            min_length=1,
+            max_length=output.sequences.shape[-1],
+            config=config,
+            use_cache=use_cache,
+        )
+
+    def _create_and_check_torchscript(self, config, inputs_dict):
+        if not self.test_torchscript:
+            return
+
+        configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+        configs_no_init.torchscript = True
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            model.to(torch_device)
+            model.eval()
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            try:
+                model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
+                input_features = inputs["input_features"]
+                attention_mask = inputs["attention_mask"]
+                decoder_input_ids = inputs["decoder_input_ids"]
+                decoder_attention_mask = inputs["decoder_attention_mask"]
+                traced_model = torch.jit.trace(
+                    model, (input_features, attention_mask, decoder_input_ids, decoder_attention_mask)
+                )
+            except RuntimeError:
+                self.fail("Couldn't trace module.")
+
+            with tempfile.TemporaryDirectory() as tmp_dir_name:
+                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+                try:
+                    torch.jit.save(traced_model, pt_file_name)
+                except Exception:
+                    self.fail("Couldn't save module.")
+
+                try:
+                    loaded_model = torch.jit.load(pt_file_name)
+                except Exception:
+                    self.fail("Couldn't load module.")
+
+            model.to(torch_device)
+            model.eval()
+
+            loaded_model.to(torch_device)
+            loaded_model.eval()
+
+            model_state_dict = model.state_dict()
+            loaded_model_state_dict = loaded_model.state_dict()
+
+            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+
+            models_equal = True
+            for layer_name, p1 in model_state_dict.items():
+                p2 = loaded_model_state_dict[layer_name]
+                if p1.data.ne(p2.data).sum() > 0:
+                    models_equal = False
+
+            self.assertTrue(models_equal)
+
+
+@require_torch
+@require_torchaudio
+@require_sentencepiece
+@require_tokenizers
+@slow
+class WhisperModelIntegrationTests(unittest.TestCase):
+    @cached_property
+    def default_processor(self):
+        return WhisperProcessor.from_pretrained("openai/whisper-base")
+
+    def _load_datasamples(self, num_samples):
+        from datasets import load_dataset
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]
+
+    def test_generation_librispeech(self):
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+        model.to(torch_device)
+        processor = self.default_processor
+
+        input_speech = self._load_datasamples(1)
+
+        input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)
+
+        generated_ids = model.generate(input_features)
+        generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        EXPECTED_TRANSCRIPTIONS = [
+            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
+        ]
+        self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS)
+
+    def test_generation_librispeech_batched(self):
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+        model.to(torch_device)
+        processor = self.default_processor
+
+        input_speech = self._load_datasamples(4)
+
+        inputs = processor(input_speech, return_tensors="pt", padding=True)
+
+        input_features = inputs.input_features.to(torch_device)
+        attention_mask = inputs.attention_mask.to(torch_device)
+
+        generated_ids = model.generate(input_features, attention_mask=attention_mask)
+        generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        EXPECTED_TRANSCRIPTIONS = [
+            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
+            "nor is mister cultar's manner less interesting than his matter",
+            "he tells us that at this festive season of the year with christmas and roast beef looming before us"
+            " similes drawn from eating and its results occur most readily to the mind",
+            "he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it"
+            " but little of rocky ithaca",
+        ]
+
+        self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
new file mode 100644
index 0000000000000..cf483e44a2fea
--- /dev/null
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -0,0 +1,144 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+import tempfile
+import unittest
+from pathlib import Path
+from shutil import copyfile
+
+from transformers import WhisperTokenizer, is_speech_available
+from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES, save_json
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio
+from transformers.utils import FEATURE_EXTRACTOR_NAME
+
+from .test_feature_extraction_whisper import floats_list
+
+
+if is_speech_available():
+    from transformers import WhisperFeatureExtractor, WhisperProcessor
+
+
+SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+@require_torch
+@require_torchaudio
+@require_sentencepiece
+class WhisperProcessorTest(unittest.TestCase):
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+
+        vocab = ["<s>", "<pad>", "</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        save_dir = Path(self.tmpdirname)
+        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
+        if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
+            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
+
+        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+        feature_extractor_map = {
+            "feature_size": 24,
+            "num_mel_bins": 24,
+            "padding_value": 0.0,
+            "sampling_rate": 16000,
+            "return_attention_mask": False,
+            "do_normalize": True,
+        }
+        save_json(feature_extractor_map, save_dir / FEATURE_EXTRACTOR_NAME)
+
+    def get_tokenizer(self, **kwargs):
+        return WhisperTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_feature_extractor(self, **kwargs):
+        return WhisperFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def test_save_load_pretrained_default(self):
+        tokenizer = self.get_tokenizer()
+        feature_extractor = self.get_feature_extractor()
+
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        processor.save_pretrained(self.tmpdirname)
+        processor = WhisperProcessor.from_pretrained(self.tmpdirname)
+
+        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
+        self.assertIsInstance(processor.tokenizer, WhisperTokenizer)
+
+        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
+        self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
+
+    def test_save_load_pretrained_additional_features(self):
+        processor = WhisperProcessor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
+        processor.save_pretrained(self.tmpdirname)
+
+        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
+        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
+
+        processor = WhisperProcessor.from_pretrained(
+            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
+        )
+
+        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
+        self.assertIsInstance(processor.tokenizer, WhisperTokenizer)
+
+        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
+        self.assertIsInstance(processor.feature_extractor, WhisperFeatureExtractor)
+
+    def test_feature_extractor(self):
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        raw_speech = floats_list((3, 1000))
+
+        input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
+        input_processor = processor(raw_speech, return_tensors="np")
+
+        for key in input_feat_extract.keys():
+            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
+
+    def test_tokenizer(self):
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        input_str = "This is a test string"
+
+        encoded_processor = processor(text=input_str)
+
+        encoded_tok = tokenizer(input_str)
+
+        for key in encoded_tok.keys():
+            self.assertListEqual(encoded_tok[key], encoded_processor[key])
+
+    def test_tokenizer_decode(self):
+        feature_extractor = self.get_feature_extractor()
+        tokenizer = self.get_tokenizer()
+
+        processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+
+        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
+
+        decoded_processor = processor.batch_decode(predicted_ids)
+        decoded_tok = tokenizer.batch_decode(predicted_ids)
+
+        self.assertListEqual(decoded_tok, decoded_processor)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
new file mode 100644
index 0000000000000..d568421b3200d
--- /dev/null
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -0,0 +1,163 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from pathlib import Path
+from shutil import copyfile
+
+from transformers import SPIECE_UNDERLINE, is_sentencepiece_available
+from transformers.models.whisper import WhisperTokenizer
+from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES, save_json
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
+
+if is_sentencepiece_available():
+    import sentencepiece as sp
+
+
+FR_CODE = 5
+ES_CODE = 10
+
+
+@require_sentencepiece
+@require_tokenizers
+class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    tokenizer_class = WhisperTokenizer
+    test_rust_tokenizer = False
+    test_sentencepiece = True
+
+    def setUp(self):
+        super().setUp()
+
+        spm_model = sp.SentencePieceProcessor()
+        spm_model.Load(SAMPLE_SP)
+        vocab = ["<s>", "<pad>", "</s>", "<unk>"]
+
+        vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+
+        save_dir = Path(self.tmpdirname)
+        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
+        if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
+            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
+
+        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def test_convert_token_and_id(self):
+        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
+        token = "<pad>"
+        token_id = 1
+
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+
+        self.assertEqual(vocab_keys[0], "<s>")
+        self.assertEqual(vocab_keys[1], "<pad>")
+        self.assertEqual(vocab_keys[-1], "j")
+        self.assertEqual(len(vocab_keys), 1_001)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_001)
+
+    def test_full_tokenizer(self):
+        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens),
+            [289, 50, 14, 174, 386],
+        )
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual(
+            tokens,
+            # fmt: off
+            [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", "."],
+            # fmt: on
+        )
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [12, 25, 88, 59, 28, 23, 11, 4, 606, 351, 351, 351, 7, 16, 70, 50, 76, 84, 10, 4, 8])
+
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(
+            back_tokens,
+            # fmt: off
+            [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", "."],
+            # fmt: on
+        )
+
+    @slow
+    def test_tokenizer_integration(self):
+        # fmt: off
+        expected_encoding = {'input_ids': [[3791, 797, 31, 11, 64, 797, 31, 2429, 433, 12, 1176, 12, 20, 786, 915, 142, 2413, 240, 37, 3238, 797, 31, 11, 35, 93, 915, 142, 2413, 240, 37, 5540, 567, 1276, 93, 37, 610, 40, 62, 455, 657, 1042, 123, 780, 177, 37, 309, 241, 1298, 514, 20, 292, 2737, 114, 2469, 241, 85, 64, 302, 548, 528, 423, 4, 509, 406, 423, 37, 601, 4, 777, 302, 548, 528, 423, 284, 4, 3388, 511, 459, 4, 3555, 40, 321, 302, 705, 4, 3388, 511, 583, 326, 5, 5, 5, 62, 3310, 560, 177, 2680, 217, 1508, 32, 31, 853, 418, 64, 583, 511, 1605, 62, 35, 93, 560, 177, 2680, 217, 1508, 1521, 64, 583, 511, 519, 62, 20, 1515, 764, 20, 149, 261, 5625, 7972, 20, 5540, 567, 1276, 93, 3925, 1675, 11, 15, 802, 7972, 576, 217, 1508, 11, 35, 93, 1253, 2441, 15, 289, 652, 31, 416, 321, 3842, 115, 40, 911, 8, 476, 619, 4, 380, 142, 423, 335, 240, 35, 93, 264, 8, 11, 335, 569, 420, 163, 5, 2], [260, 548, 528, 423, 20, 451, 20, 2681, 1153, 3434, 20, 5540, 37, 567, 126, 1253, 2441, 3376, 449, 210, 431, 1563, 177, 767, 5540, 11, 1203, 472, 11, 2953, 685, 285, 364, 706, 1153, 20, 6799, 20, 2869, 20, 4464, 126, 40, 2429, 20, 1040, 866, 2664, 418, 20, 318, 20, 1726, 186, 20, 265, 522, 35, 93, 2191, 4634, 20, 1040, 12, 6799, 15, 228, 2356, 142, 31, 11, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2575, 2666, 684, 1582, 1176, 12, 627, 149, 619, 20, 4902, 563, 11, 20, 149, 261, 3420, 2356, 174, 142, 4714, 131, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # noqa: E501
+        # fmt: on
+
+        self.tokenizer_integration_test_util(
+            expected_encoding=expected_encoding,
+            model_name="facebook/s2t-small-mustc-en-de-st",
+            revision="a14f04cf0776c02f62a8cb800cf7909e15ea23ad",
+        )
+
+
+@require_sentencepiece
+class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
+    checkpoint_name = "valhalla/s2t_mustc_multilinguial_medium"
+
+    french_text = "C'est trop cool"
+    spanish_text = "Esto es genial"
+
+    @classmethod
+    def setUpClass(cls):
+        cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
+        return cls
+
+    def check_language_codes(self):
+        self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)
+        self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6)
+        self.assertEqual(self.tokenizer.lang_code_to_id["it"], 9)
+        self.assertEqual(self.tokenizer.lang_code_to_id["de"], 11)
+
+    def test_vocab_size(self):
+        self.assertEqual(self.tokenizer.vocab_size, 10_000)
+
+    def test_tokenizer_decode_ignores_language_codes(self):
+        self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
+        generated_ids = [ES_CODE, 4, 1601, 47, 7647, 2]
+        result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
+        expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
+        self.assertEqual(result, expected_spanish)
+        self.assertNotIn(self.tokenizer.eos_token, result)
+
+    def test_tokenizer_adds_special_tokens(self):
+        self.tokenizer.tgt_lang = "fr"
+        encoded = self.tokenizer(self.french_text).input_ids
+        self.assertEqual(encoded[0], FR_CODE)
+        self.assertEqual(encoded[-1], self.tokenizer.eos_token_id)
+
+    def test_tgt_lang_setter(self):
+        self.tokenizer.tgt_lang = "fr"
+        self.assertListEqual(self.tokenizer.prefix_tokens, [FR_CODE])
+
+        self.tokenizer.tgt_lang = "es"
+        self.assertListEqual(self.tokenizer.prefix_tokens, [ES_CODE])

From 974235f95b76a5ebbcab69518ba51be795d89ea7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 12:55:20 +0000
Subject: [PATCH 007/156] copnversion for all models

---
 .../models/whisper/configuration_whisper.py   |   9 +-
 .../whisper/convert_openai_whisper_to_tfms.py | 157 +++++++++++++-----
 .../models/whisper/modeling_whisper.py        |   5 +-
 3 files changed, 125 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 296f307cf1c09..d30f128843343 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -99,10 +99,10 @@ class WhisperConfig(PretrainedConfig):
     ```python
     >>> from transformers import WhisperModel, WhisperConfig
 
-    >>> # Initializing a Whisper s2t_transformer_s style configuration
+    >>> # Initializing a Whisper tiny style configuration
     >>> configuration = WhisperConfig()
 
-    >>> # Initializing a model from the s2t_transformer_s style configuration
+    >>> # Initializing a model from the tiny style configuration
     >>> model = WhisperModel(configuration)
 
     >>> # Accessing the model configuration
@@ -132,6 +132,8 @@ def __init__(
         init_std=0.02,
         decoder_start_token_id=2,
         scale_embedding=False,
+        max_source_positions=1498,
+        max_target_positions=448,
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
@@ -160,7 +162,8 @@ def __init__(
         self.tie_word_embeddings = tie_word_embeddings
         self.input_feat_per_channel = input_feat_per_channel
         self.input_channels = input_channels
-
+        self.max_source_positions=max_source_positions
+        self.max_target_positions=max_target_positions
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 4176ed3775837..47fdd8a25e3e2 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -17,19 +17,14 @@
 import torch
 from torch import nn
 
-from transformers import WhisperConfig, WhisperForConditionalGeneration
+from transformers import WhisperConfig, WhisperModel
 
 
 def remove_ignore_keys_(state_dict):
     ignore_keys = [
-        "encoder.version",
-        "decoder.version",
-        "model.encoder.version",
-        "model.decoder.version",
-        "decoder.output_projection.weight",
-        "_float_tensor",
-        "encoder.embed_positions._float_tensor",
-        "decoder.embed_positions._float_tensor",
+        "layers",
+        "blocks",
+        "proj_out.weight"
     ]
     for k in ignore_keys:
         state_dict.pop(k, None)
@@ -38,32 +33,39 @@ def remove_ignore_keys_(state_dict):
 WHISPER_MAPPING = {
     "blocks" : "layers",
     "mlp.0":"fc1",
-    "mlp:2":"fc2",
-    "attn_ln":"self_attn_layer_norm",
-    "attn.out":"self_attn.out_proj",
+    "mlp.2":"fc2",
     "mlp_ln":"final_layer_norm",
     "blocks":"layers",
-    "attn.query":"self_attn.q_proj",
-    "attn.key":"self_attn.k_proj",
-    "attn.value":"self_attn.v_proj"
-
+    ".attn.query":".self_attn.q_proj",
+    ".attn.key":".self_attn.k_proj",
+    ".attn.value":".self_attn.v_proj",
+    ".attn_ln":".self_attn_layer_norm",
+    ".attn.out":".self_attn.out_proj",
+    ".cross_attn.query":".encoder_attn.q_proj",
+    ".cross_attn.key":".encoder_attn.k_proj",
+    ".cross_attn.value":".encoder_attn.v_proj",
+    ".cross_attn_ln":".encoder_attn_layer_norm",
+    ".cross_attn.out":".encoder_attn.out_proj",
+    "decoder.ln.":"decoder.layer_norm.",
+    "encoder.ln.":"encoder.layer_norm.",
+    "token_embedding":"embed_tokens",
+    "encoder.positional_embedding":"encoder.embed_positions.weights",
+    "decoder.positional_embedding":"decoder.embed_positions.weight",
+    "ln_post":"layer_norm"
 }
 
 def rename_keys(s_dict):
     keys = list(s_dict.keys())
     for key in keys:
-        if "blocks" in key:
-            s_dict[key.replace("blocks", "layers")] = s_dict.pop(key)
-            s_dict[key.replace("attn.query", "self_attn.q_proj")] = s_dict.pop(key)
-            s_dict[key.replace("attn.key", "self_k.q_proj")] = s_dict.pop(key)
-            s_dict[key.replace("attn.value", "self_attn.v_proj")] = s_dict.pop(key)
-
-
+        new_key = key 
+        for k,v in WHISPER_MAPPING.items():
+            if k in key:
+                new_key = new_key.replace(k, v)
 
+        print(f"{key} -> {new_key}")
 
-        elif "subsample" in key:
-            s_dict[key.replace("subsample", "conv")] = s_dict.pop(key)
-
+        s_dict[new_key] = s_dict.pop(key)
+    return s_dict
 
 def make_linear_from_emb(emb):
     vocab_size, emb_size = emb.weight.shape
@@ -134,26 +136,99 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
 
     model.save_pretrained(pytorch_dump_folder_path)
 
+_MODELS = {
+    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
+    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
+    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
+    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
+    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
+    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
+    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
+    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
+    "large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
+}
+
+import hashlib
+import io
+import os
+import urllib
+import warnings
+from tqdm import tqdm
+
+def _download(url: str, root: str) -> bytes:
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+
+    expected_sha256 = url.split("/")[-2]
+    download_target = os.path.join(root, filename)
+
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(f"{download_target} exists and is not a regular file")
+
+    if os.path.isfile(download_target):
+        model_bytes = open(download_target, "rb").read()
+        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+            return model_bytes
+        else:
+            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
+
+    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
+        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
+
+    model_bytes = open(download_target, "rb").read()
+    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model.")
+
+    return model_bytes
+
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
-    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    args = parser.parse_args()
-
-    tiny_config = WhisperConfig(
-        vocab_size = 51865,
-        encoder_layers = 4, 
-        encoder_attention_heads = 6,
-        decoder_attention_heads = 6,
-        decoder_layers = 4,
-        d_model = 384, 
-    )
+    # parser = argparse.ArgumentParser()
+    # # Required parameters
+    # parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
+    # parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    # args = parser.parse_args()
+    from transformers import WhisperConfig, WhisperModel
+    import torch 
+
+
+    layers = [4,6,12,24,32]
+    width = [384,512,768,1024,1280]
+    heads = [6, 8, 12, 16, 20]
+    name = ["tiny","base", "small","medium","large"]
+    for l,w,h,n in zip(layers, width, heads, name):
+        config = WhisperConfig(
+            vocab_size = 51865,
+            encoder_layers =l, 
+            encoder_attention_heads = h,
+            decoder_attention_heads = h,
+            decoder_layers = l,
+            d_model = w, 
+        )
+        model = WhisperModel(config)
+
+
+        model_bytes = _download(_MODELS[n], "weights")
+        with io.BytesIO(model_bytes) as fp:
+            original = torch.load(fp, map_location="cpu")["model_state_dict"]
+
+        # original = torch.load(f"/home/arthur_huggingface_co/whisper/tiny.pt")
+        new = rename_keys(original.copy())
+
+
+        missing, unexpected = model.load_state_dict(new, strict = False)
+        if missing == ["proj_out.weight"]: 
+            print("succesfully loaded")
 
 
 
 
 
 
-    convert_fairseq_s2t_checkpoint_to_tfms(args.fairseq_path, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index d737b1498b728..d66f45e7490f9 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -236,7 +236,7 @@ def __init__(
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
 
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False) # no bias in the k_proj in original code 
         self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -1126,8 +1126,9 @@ def custom_forward(*inputs):
     "The bare Whisper Model outputting raw hidden-states without any specific head on top.",
     WHISPER_START_DOCSTRING,
 )
-# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextModel with Speech2Text->Whisper,SPEECH_TO_TEXT->WHISPER,facebook/s2t-small-librispeech-asr->openai/whisper-base
 class WhisperModel(WhisperPreTrainedModel):
+    _keys_to_ignore_on_load_missing = [r"proj_out.weight"]
+
     def __init__(self, config: WhisperConfig):
         super().__init__(config)
 

From 40c42ab711eb8dee52739cc2696a6b7dd37d831a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 14:07:14 +0000
Subject: [PATCH 008/156] update processor for correct padding

---
 .../models/whisper/processing_whisper.py      | 112 ++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 src/transformers/models/whisper/processing_whisper.py

diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
new file mode 100644
index 0000000000000..2eff12f89dad2
--- /dev/null
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Whisper
+"""
+import warnings
+from contextlib import contextmanager
+
+from ...processing_utils import ProcessorMixin
+
+
+class WhisperProcessor(ProcessorMixin):
+    r"""
+    Constructs a Whisper processor which wraps a Whisper feature extractor and a Whisper tokenizer into a single
+    processor.
+
+    [`WhisperProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`WhisperTokenizer`]. See
+    the [`~WhisperProcessor.__call__`] and [`~WhisperProcessor.decode`] for more information.
+
+    Args:
+        feature_extractor (`WhisperFeatureExtractor`):
+            An instance of [`WhisperFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`WhisperTokenizer`):
+            An instance of [`WhisperTokenizer`]. The tokenizer is a required input.
+    """
+    feature_extractor_class = "WhisperFeatureExtractor"
+    tokenizer_class = "GPT2Tokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to WhisperFeatureExtractor's
+        [`~WhisperFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~WhisperProcessor.as_target_processor`] this method forwards all its arguments to WhisperTokenizer's
+        [`~WhisperTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Whisper.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False

From 792d964e63484eea3f944d2c3cfcfbe84410777a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 14:07:40 +0000
Subject: [PATCH 009/156] update feature extraction

---
 .../models/whisper/feature_extraction_whisper.py            | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 24dfcb34e437f..aa186a7475008 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -98,8 +98,10 @@ def _extract_fbank_features(
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
         log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
+        
         return log_spec
 
+
     def __call__(
         self,
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
@@ -194,7 +196,7 @@ def __call__(
             raw_speech = [raw_speech]
 
         # extract fbank features
-        features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
+        features = [self._extract_fbank_features(waveform).permute(1,0) for waveform in raw_speech]
 
         # convert into correct format for padding
         encoded_inputs = BatchFeature({"input_features": features})
@@ -210,7 +212,7 @@ def __call__(
         )
 
         # make sure list is in array format
-        input_features = padded_inputs.get("input_features")
+        input_features = padded_inputs.get("input_features").permute(0,2,1)
         if isinstance(input_features[0], list):
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
 

From 339f95ca5baeae2a8ea1cf025a4b9422022a3a7b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 15:59:06 +0000
Subject: [PATCH 010/156] update integration test logits match

---
 src/transformers/models/whisper/__init__.py   | 104 +++++++
 .../models/whisper/configuration_whisper.py   |   8 +-
 .../whisper/convert_openai_whisper_to_tfms.py | 102 +++----
 .../whisper/feature_extraction_whisper.py     |  15 +-
 .../models/whisper/modeling_whisper.py        |  47 ++-
 .../models/whisper/tokenization_whisper.py    | 286 ++++++++++++++++++
 tests/models/whisper/test_modeling_whisper.py | 163 ++++++++--
 7 files changed, 605 insertions(+), 120 deletions(-)
 create mode 100644 src/transformers/models/whisper/__init__.py
 create mode 100644 src/transformers/models/whisper/tokenization_whisper.py

diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py
new file mode 100644
index 0000000000000..2de334e5cbc07
--- /dev/null
+++ b/src/transformers/models/whisper/__init__.py
@@ -0,0 +1,104 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_sentencepiece_available,
+    is_speech_available,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig"],
+}
+
+try:
+    if not is_sentencepiece_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["tokenization_whisper"] = ["WhisperTokenizer"]
+
+try:
+    if not is_speech_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["feature_extraction_whisper"] = ["WhisperFeatureExtractor"]
+
+    if is_sentencepiece_available():
+        _import_structure["processing_whisper"] = ["WhisperProcessor"]
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_whisper"] = [
+        "WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "WhisperForConditionalGeneration",
+        "WhisperModel",
+        "WhisperPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig
+
+    try:
+        if not is_sentencepiece_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .tokenization_whisper import WhisperTokenizer
+
+    try:
+        if not is_speech_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .feature_extraction_whisper import WhisperFeatureExtractor
+
+        if is_sentencepiece_available():
+            from .processing_whisper import WhisperProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_whisper import (
+            WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            WhisperForConditionalGeneration,
+            WhisperModel,
+            WhisperPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index d30f128843343..5d6b7d434073f 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -132,12 +132,11 @@ def __init__(
         init_std=0.02,
         decoder_start_token_id=2,
         scale_embedding=False,
-        max_source_positions=1498,
+        max_source_positions=1500,
         max_target_positions=448,
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
-        input_feat_per_channel=80,
         input_channels=1,
         tie_word_embeddings=True,
         **kwargs
@@ -160,10 +159,9 @@ def __init__(
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
         self.tie_word_embeddings = tie_word_embeddings
-        self.input_feat_per_channel = input_feat_per_channel
         self.input_channels = input_channels
-        self.max_source_positions=max_source_positions
-        self.max_target_positions=max_target_positions
+        self.max_source_positions = max_source_positions
+        self.max_target_positions = max_target_positions
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 47fdd8a25e3e2..32022a48c93f4 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -21,44 +21,41 @@
 
 
 def remove_ignore_keys_(state_dict):
-    ignore_keys = [
-        "layers",
-        "blocks",
-        "proj_out.weight"
-    ]
+    ignore_keys = ["layers", "blocks", "proj_out.weight"]
     for k in ignore_keys:
         state_dict.pop(k, None)
 
 
 WHISPER_MAPPING = {
-    "blocks" : "layers",
-    "mlp.0":"fc1",
-    "mlp.2":"fc2",
-    "mlp_ln":"final_layer_norm",
-    "blocks":"layers",
-    ".attn.query":".self_attn.q_proj",
-    ".attn.key":".self_attn.k_proj",
-    ".attn.value":".self_attn.v_proj",
-    ".attn_ln":".self_attn_layer_norm",
-    ".attn.out":".self_attn.out_proj",
-    ".cross_attn.query":".encoder_attn.q_proj",
-    ".cross_attn.key":".encoder_attn.k_proj",
-    ".cross_attn.value":".encoder_attn.v_proj",
-    ".cross_attn_ln":".encoder_attn_layer_norm",
-    ".cross_attn.out":".encoder_attn.out_proj",
-    "decoder.ln.":"decoder.layer_norm.",
-    "encoder.ln.":"encoder.layer_norm.",
-    "token_embedding":"embed_tokens",
-    "encoder.positional_embedding":"encoder.embed_positions.weights",
-    "decoder.positional_embedding":"decoder.embed_positions.weight",
-    "ln_post":"layer_norm"
+    "blocks": "layers",
+    "mlp.0": "fc1",
+    "mlp.2": "fc2",
+    "mlp_ln": "final_layer_norm",
+    "blocks": "layers",
+    ".attn.query": ".self_attn.q_proj",
+    ".attn.key": ".self_attn.k_proj",
+    ".attn.value": ".self_attn.v_proj",
+    ".attn_ln": ".self_attn_layer_norm",
+    ".attn.out": ".self_attn.out_proj",
+    ".cross_attn.query": ".encoder_attn.q_proj",
+    ".cross_attn.key": ".encoder_attn.k_proj",
+    ".cross_attn.value": ".encoder_attn.v_proj",
+    ".cross_attn_ln": ".encoder_attn_layer_norm",
+    ".cross_attn.out": ".encoder_attn.out_proj",
+    "decoder.ln.": "decoder.layer_norm.",
+    "encoder.ln.": "encoder.layer_norm.",
+    "token_embedding": "embed_tokens",
+    "encoder.positional_embedding": "encoder.embed_positions.weight",
+    "decoder.positional_embedding": "decoder.embed_positions.weight",
+    "ln_post": "layer_norm",
 }
 
+
 def rename_keys(s_dict):
     keys = list(s_dict.keys())
     for key in keys:
-        new_key = key 
-        for k,v in WHISPER_MAPPING.items():
+        new_key = key
+        for k, v in WHISPER_MAPPING.items():
             if k in key:
                 new_key = new_key.replace(k, v)
 
@@ -67,6 +64,7 @@ def rename_keys(s_dict):
         s_dict[new_key] = s_dict.pop(key)
     return s_dict
 
+
 def make_linear_from_emb(emb):
     vocab_size, emb_size = emb.weight.shape
     lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
@@ -136,6 +134,7 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
 
     model.save_pretrained(pytorch_dump_folder_path)
 
+
 _MODELS = {
     "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
     "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
@@ -153,8 +152,10 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
 import os
 import urllib
 import warnings
+
 from tqdm import tqdm
 
+
 def _download(url: str, root: str) -> bytes:
     os.makedirs(root, exist_ok=True)
     filename = os.path.basename(url)
@@ -173,7 +174,9 @@ def _download(url: str, root: str) -> bytes:
             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
 
     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
-        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
+        with tqdm(
+            total=int(source.info().get("Content-Length")), ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
+        ) as loop:
             while True:
                 buffer = source.read(8192)
                 if not buffer:
@@ -184,7 +187,9 @@ def _download(url: str, root: str) -> bytes:
 
     model_bytes = open(download_target, "rb").read()
     if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
-        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model.")
+        raise RuntimeError(
+            "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
+        )
 
     return model_bytes
 
@@ -195,26 +200,25 @@ def _download(url: str, root: str) -> bytes:
     # parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
     # parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
     # args = parser.parse_args()
-    from transformers import WhisperConfig, WhisperModel
-    import torch 
+    import torch
 
+    from transformers import WhisperConfig, WhisperModel
 
-    layers = [4,6,12,24,32]
-    width = [384,512,768,1024,1280]
+    layers = [4, 6, 12, 24, 32]
+    width = [384, 512, 768, 1024, 1280]
     heads = [6, 8, 12, 16, 20]
-    name = ["tiny","base", "small","medium","large"]
-    for l,w,h,n in zip(layers, width, heads, name):
+    name = ["tiny", "base", "small", "medium", "large"]
+    for l, w, h, n in zip(layers, width, heads, name):
         config = WhisperConfig(
-            vocab_size = 51865,
-            encoder_layers =l, 
-            encoder_attention_heads = h,
-            decoder_attention_heads = h,
-            decoder_layers = l,
-            d_model = w, 
+            vocab_size=51865,
+            encoder_layers=l,
+            encoder_attention_heads=h,
+            decoder_attention_heads=h,
+            decoder_layers=l,
+            d_model=w,
         )
         model = WhisperModel(config)
 
-
         model_bytes = _download(_MODELS[n], "weights")
         with io.BytesIO(model_bytes) as fp:
             original = torch.load(fp, map_location="cpu")["model_state_dict"]
@@ -222,13 +226,7 @@ def _download(url: str, root: str) -> bytes:
         # original = torch.load(f"/home/arthur_huggingface_co/whisper/tiny.pt")
         new = rename_keys(original.copy())
 
-
-        missing, unexpected = model.load_state_dict(new, strict = False)
-        if missing == ["proj_out.weight"]: 
+        missing, unexpected = model.load_state_dict(new, strict=False)
+        if missing == []:
             print("succesfully loaded")
-
-
-
-
-
-
+            model.save_pretrained(f"whisper/{n}")
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index aa186a7475008..1e35189187f41 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -76,6 +76,8 @@ def __init__(
         self.hop_length = hop_length
         self.chunk_length = chunk_length
         self.return_attention_mask = True
+        self.n_samples = chunk_length * sampling_rate
+        self.nb_max_frame = self.n_samples // hop_length
 
         with np.load(mel_filter_file) as f:
             self.mel_filters = torch.from_numpy(f[f"mel_{self.num_mel_bins}"])
@@ -98,9 +100,8 @@ def _extract_fbank_features(
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
         log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
-        
-        return log_spec
 
+        return log_spec
 
     def __call__(
         self,
@@ -196,23 +197,23 @@ def __call__(
             raw_speech = [raw_speech]
 
         # extract fbank features
-        features = [self._extract_fbank_features(waveform).permute(1,0) for waveform in raw_speech]
+        features = [self._extract_fbank_features(waveform).permute(1, 0) for waveform in raw_speech]
 
         # convert into correct format for padding
         encoded_inputs = BatchFeature({"input_features": features})
 
         padded_inputs = self.pad(
             encoded_inputs,
-            padding=padding,
-            max_length=max_length,
+            padding="max_length",
+            max_length=self.nb_max_frame,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
             return_attention_mask=return_attention_mask,
             **kwargs,
         )
-
+        padded_inputs["input_features"] = padded_inputs["input_features"].permute(0, 2, 1)
         # make sure list is in array format
-        input_features = padded_inputs.get("input_features").permute(0,2,1)
+        input_features = padded_inputs.get("input_features")
         if isinstance(input_features[0], list):
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
 
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index d66f45e7490f9..a0a814badb8b8 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -236,7 +236,7 @@ def __init__(
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
 
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False) # no bias in the k_proj in original code 
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)  # no bias in the k_proj in original code
         self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -357,6 +357,7 @@ def forward(
 
         return attn_output, attn_weights_reshaped, past_key_value
 
+
 # Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextEncoderLayer with Speech2Text->Whisper
 class WhisperEncoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
@@ -426,6 +427,7 @@ def forward(
 
         return outputs
 
+
 # Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextDecoderLayer with Speech2Text->Whisper
 class WhisperDecoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
@@ -710,6 +712,8 @@ class WhisperEncoder(WhisperPreTrainedModel):
 
     def __init__(self, config: WhisperConfig):
         super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
 
         embed_dim = config.d_model
         self.num_mel_bins = config.num_mel_bins
@@ -720,11 +724,14 @@ def __init__(self, config: WhisperConfig):
         self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
         self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
 
-        self.embed_positions = WhisperSinusoidalPositionalEmbedding(
-            self.max_source_positions,
-            embed_dim,
-            self.padding_idx,
-        )
+        # self.embed_positions = WhisperSinusoidalPositionalEmbedding(
+        #     self.max_source_positions,
+        #     embed_dim,
+        #     self.padding_idx,
+        # )
+
+        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
+
         self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
         self.layer_norm = nn.LayerNorm(config.d_model)
 
@@ -779,18 +786,10 @@ def forward(
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         inputs_embeds = F.gelu(self.conv1(input_features))
-        inputs_embeds = F.gelu(self.conv2(input_features))
+        inputs_embeds = F.gelu(self.conv2(inputs_embeds))
 
         inputs_embeds = inputs_embeds.permute(0, 2, 1)
-
-        # subsample attention mask if necessary
-        if attention_mask is not None:
-            attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
-            padding_mask = attention_mask.ne(1).long()
-        else:
-            padding_mask = torch.zeros(inputs_embeds.shape[:2], dtype=torch.long, device=inputs_embeds.device)
-
-        embed_pos = self.embed_positions(padding_mask)
+        embed_pos = self.embed_positions.weight
 
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
@@ -1134,7 +1133,6 @@ def __init__(self, config: WhisperConfig):
 
         self.encoder = WhisperEncoder(config)
         self.decoder = WhisperDecoder(config)
-        self.proj_out = nn.Linear(config.d_model, config.vocab_size,bias=False)
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1144,12 +1142,6 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.decoder.embed_tokens = value
 
-    def set_output_embeddings(self, new_embeddings):
-        self.proj_out = new_embeddings
-
-    def get_output_embeddings(self):
-        return self.proj_out
-
     def get_encoder(self):
         return self.encoder
 
@@ -1245,7 +1237,6 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        decoder_outputs = self.proj_out(decoder_outputs)
 
         if not return_dict:
             return decoder_outputs + encoder_outputs
@@ -1283,7 +1274,7 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
     def __init__(self, config: WhisperConfig):
         super().__init__(config)
         self.model = WhisperModel(config)
-        self.lm_head = nn.Linear(config.d_model, self.config.vocab_size, bias=False)
+        self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1299,10 +1290,10 @@ def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
         return new_embeddings
 
     def get_output_embeddings(self):
-        return self.lm_head
+        return self.proj_out
 
     def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
+        self.proj_out = new_embeddings
 
     @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1380,7 +1371,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        lm_logits = self.lm_head(outputs[0])
+        lm_logits = self.proj_out(outputs[0])
 
         loss = None
         if labels is not None:
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
new file mode 100644
index 0000000000000..96fa87a125381
--- /dev/null
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -0,0 +1,286 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Whisper."""
+import json
+import os
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sentencepiece
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "spm_file": "sentencepiece.bpe.model",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
+    },
+    "spm_file": {
+        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/sentencepiece.bpe.model"
+    },
+}
+
+MAX_MODEL_INPUT_SIZES = {
+    "openai/whisper-base": 1024,
+}
+
+MUSTC_LANGS = ["pt", "fr", "ru", "nl", "ro", "it", "es", "de"]
+
+LANGUAGES = {"mustc": MUSTC_LANGS}
+
+
+class WhisperTokenizer(PreTrainedTokenizer):
+    """
+    Construct an Whisper tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
+    the superclass for more information regarding such methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        spm_file (`str`):
+            Path to the [SentencePiece](https://github.com/google/sentencepiece) model file
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        do_upper_case (`bool`, *optional*, defaults to `False`):
+           Whether or not to uppercase the output when decoding.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        tgt_lang (`str`, *optional*):
+            A string representing the target language.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        spm_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        unk_token="<unk>",
+        do_upper_case=False,
+        do_lower_case=False,
+        tgt_lang=None,
+        lang_codes=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            do_upper_case=do_upper_case,
+            do_lower_case=do_lower_case,
+            tgt_lang=tgt_lang,
+            lang_codes=lang_codes,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+        self.do_upper_case = do_upper_case
+        self.do_lower_case = do_lower_case
+
+        self.encoder = load_json(vocab_file)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.spm_file = spm_file
+        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
+
+        if lang_codes is not None:
+            self.lang_codes = lang_codes
+            self.langs = LANGUAGES[lang_codes]
+            self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
+            self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}
+
+            self._additional_special_tokens = self.lang_tokens
+            self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
+
+            self.set_tgt_lang_special_tokens(self._tgt_lang)
+        else:
+            self.lang_code_to_id = {}
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder)
+
+    @property
+    def tgt_lang(self) -> str:
+        return self._tgt_lang
+
+    @tgt_lang.setter
+    def tgt_lang(self, new_tgt_lang) -> None:
+        self._tgt_lang = new_tgt_lang
+        self.set_tgt_lang_special_tokens(new_tgt_lang)
+
+    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
+        """Reset the special tokens to the target language setting. prefix=[eos, tgt_lang_code] and suffix=[eos]."""
+        lang_code_id = self.lang_code_to_id[tgt_lang]
+        self.prefix_tokens = [lang_code_id]
+
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        return self.encoder.get(token, self.encoder[self.unk_token])
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the decoder."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = self.sp_model.decode(tokens)
+
+        if self.do_upper_case:
+            out_string = out_string.upper()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
+        """Build model inputs from a sequence by appending eos_token_id."""
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1]
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    def get_vocab(self) -> Dict:
+        vocab = self.encoder.copy()
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self) -> Dict:
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d: Dict) -> None:
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        save_dir = Path(save_directory)
+        assert save_dir.is_dir(), f"{save_directory} should be a directory"
+        vocab_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+        )
+        spm_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
+        )
+
+        save_json(self.encoder, vocab_save_path)
+
+        if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
+            copyfile(self.spm_file, spm_save_path)
+        elif not os.path.isfile(self.spm_file):
+            with open(spm_save_path, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (str(vocab_save_path), str(spm_save_path))
+
+
+def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
+    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
+    spm.Load(str(path))
+    return spm
+
+
+def load_json(path: str) -> Union[Dict, List]:
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def save_json(data, path: str) -> None:
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index bdf6377c98613..5048eef741d89 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -736,45 +736,152 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
-    def test_generation_librispeech(self):
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    def test_tiny_logits_librispeech(self):
+
+        from transformers import GPT2Tokenizer, WhisperFeatureExtractor, set_seed
+
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperModel.from_pretrained("whisper/tiny")
         model.to(torch_device)
-        processor = self.default_processor
+
+        # processor = self.default_processor
 
         input_speech = self._load_datasamples(1)
 
-        input_features = processor(input_speech, return_tensors="pt").input_features.to(torch_device)
+        feaure_extractor = WhisperFeatureExtractor(
+            "/home/arthur_huggingface_co/whisper/whisper/assets/mel_filters.npz"
+        )
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer.pad_token = 0
 
-        generated_ids = model.generate(input_features)
-        generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        processor = WhisperProcessor(feaure_extractor, tokenizer)
 
-        EXPECTED_TRANSCRIPTIONS = [
-            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
-        ]
-        self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS)
+        input_features = processor(
+            audio=input_speech, text="This part of the speech", return_tensors="pt"
+        ).input_features.to(torch_device)
+        labels = processor(audio=input_speech, text="This part of the speech", return_tensors="pt").labels.to(
+            torch_device
+        )
+        with torch.no_grad():
+            logits = model(
+                input_features,
+                decoder_input_ids=labels,
+                output_hidden_states=False,
+                output_attentions=False,
+                use_cache=False,
+            )
+
+        logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
+
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                8.8958,
+                4.0423,
+                9.8841,
+                9.8493,
+                10.0628,
+                4.8472,
+                9.0100,
+                5.7364,
+                5.9165,
+                7.6322,
+                3.1579,
+                10.7269,
+                6.9586,
+                10.1852,
+                5.4714,
+                8.2995,
+                4.7507,
+                6.6723,
+                7.2764,
+                7.1831,
+                7.0388,
+                7.2191,
+                6.2364,
+                6.2117,
+                5.8797,
+                2.8099,
+                6.8319,
+                5.7094,
+                0.6999,
+                6.8444,
+            ]
+        )
+
+        self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
-    def test_generation_librispeech_batched(self):
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    def test_large_logits_librispeech(self):
+
+        from transformers import GPT2Tokenizer, WhisperFeatureExtractor, set_seed
+
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperModel.from_pretrained("whisper/large")
         model.to(torch_device)
-        processor = self.default_processor
 
-        input_speech = self._load_datasamples(4)
+        # processor = self.default_processor
 
-        inputs = processor(input_speech, return_tensors="pt", padding=True)
+        input_speech = self._load_datasamples(1)
 
-        input_features = inputs.input_features.to(torch_device)
-        attention_mask = inputs.attention_mask.to(torch_device)
+        feaure_extractor = WhisperFeatureExtractor(
+            "/home/arthur_huggingface_co/whisper/whisper/assets/mel_filters.npz"
+        )
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer.pad_token = 0
 
-        generated_ids = model.generate(input_features, attention_mask=attention_mask)
-        generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        processor = WhisperProcessor(feaure_extractor, tokenizer)
 
-        EXPECTED_TRANSCRIPTIONS = [
-            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
-            "nor is mister cultar's manner less interesting than his matter",
-            "he tells us that at this festive season of the year with christmas and roast beef looming before us"
-            " similes drawn from eating and its results occur most readily to the mind",
-            "he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it"
-            " but little of rocky ithaca",
-        ]
+        input_features = processor(
+            audio=input_speech, text="This part of the speech", return_tensors="pt"
+        ).input_features.to(torch_device)
+        labels = processor(audio=input_speech, text="This part of the speech", return_tensors="pt").labels.to(
+            torch_device
+        )
+        with torch.no_grad():
+            logits = model(
+                input_features,
+                decoder_input_ids=labels,
+                output_hidden_states=False,
+                output_attentions=False,
+                use_cache=False,
+            )
+
+        logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
+
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                2.1807,
+                1.1505,
+                4.8049,
+                3.9549,
+                2.7182,
+                4.1885,
+                -0.4179,
+                2.8316,
+                2.0155,
+                2.2740,
+                2.6727,
+                1.3789,
+                0.5620,
+                2.2096,
+                1.6781,
+                2.8227,
+                1.4421,
+                0.9057,
+                1.3358,
+                2.2104,
+                2.7468,
+                2.0021,
+                2.6960,
+                1.5925,
+                2.2239,
+                1.9396,
+                4.0580,
+                5.7722,
+                4.8056,
+                4.2416,
+            ]
+        )
 
-        self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)
+        self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))

From 3a2627376c20086224d17fdd980f9f7335c204f8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 16:18:29 +0000
Subject: [PATCH 011/156] fmnt: off for the logits

---
 tests/models/whisper/test_modeling_whisper.py | 73 +++----------------
 1 file changed, 12 insertions(+), 61 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 5048eef741d89..f979952216255 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -774,41 +774,16 @@ def test_tiny_logits_librispeech(self):
 
         logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
 
+        # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                8.8958,
-                4.0423,
-                9.8841,
-                9.8493,
-                10.0628,
-                4.8472,
-                9.0100,
-                5.7364,
-                5.9165,
-                7.6322,
-                3.1579,
-                10.7269,
-                6.9586,
-                10.1852,
-                5.4714,
-                8.2995,
-                4.7507,
-                6.6723,
-                7.2764,
-                7.1831,
-                7.0388,
-                7.2191,
-                6.2364,
-                6.2117,
-                5.8797,
-                2.8099,
-                6.8319,
-                5.7094,
-                0.6999,
-                6.8444,
+                8.8958,  4.0423,  9.8841,  9.8493, 10.0628,  4.8472,  9.0100,  5.7364,
+                5.9165,  7.6322,  3.1579, 10.7269,  6.9586, 10.1852,  5.4714,  8.2995,
+                4.7507,  6.6723,  7.2764,  7.1831,  7.0388,  7.2191,  6.2364,  6.2117,
+                5.8797,  2.8099,  6.8319,  5.7094,  0.6999,  6.8444
             ]
         )
-
+        # fmt: on
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
     def test_large_logits_librispeech(self):
@@ -849,39 +824,15 @@ def test_large_logits_librispeech(self):
 
         logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
 
+        # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1807,
-                1.1505,
-                4.8049,
-                3.9549,
-                2.7182,
-                4.1885,
-                -0.4179,
-                2.8316,
-                2.0155,
-                2.2740,
-                2.6727,
-                1.3789,
-                0.5620,
-                2.2096,
-                1.6781,
-                2.8227,
-                1.4421,
-                0.9057,
-                1.3358,
-                2.2104,
-                2.7468,
-                2.0021,
-                2.6960,
-                1.5925,
-                2.2239,
-                1.9396,
-                4.0580,
-                5.7722,
-                4.8056,
-                4.2416,
+                2.1807,  1.1505,  4.8049,  3.9549,  2.7182,  4.1885, -0.4179,  2.8316,
+                2.0155,  2.2740,  2.6727,  1.3789,  0.5620,  2.2096,  1.6781,  2.8227,
+                1.4421,  0.9057,  1.3358,  2.2104,  2.7468,  2.0021,  2.6960,  1.5925,
+                2.2239,  1.9396,  4.0580,  5.7722,  4.8056,  4.2416
             ]
         )
+        # fmt: on
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))

From ad5f990fa1c8b1bec4dd683154bb6b7499d7b197 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 16:42:58 +0000
Subject: [PATCH 012/156] on the fly mel bank

---
 .../models/whisper/configuration_whisper.py   |  2 +
 .../whisper/feature_extraction_whisper.py     | 57 ++++++++++++++++---
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 5d6b7d434073f..e29f165485439 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -115,6 +115,7 @@ class WhisperConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size=10000,
+        feature_size=1,
         num_mel_bins=80,
         encoder_layers=12,
         encoder_attention_heads=4,
@@ -162,6 +163,7 @@ def __init__(
         self.input_channels = input_channels
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
+        self.feature_size = feature_size
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 1e35189187f41..69433ef73df32 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -60,7 +60,6 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
 
     def __init__(
         self,
-        mel_filter_file,
         feature_size=80,
         sampling_rate=16000,
         num_mel_bins=80,
@@ -78,9 +77,57 @@ def __init__(
         self.return_attention_mask = True
         self.n_samples = chunk_length * sampling_rate
         self.nb_max_frame = self.n_samples // hop_length
+        
+        self.mel_filters = self.get_mel_filters(sampling_rate,n_fft,n_mels = num_mel_bins )
+    
+    def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
+        # Initialize the weights
+        n_mels = int(n_mels)
+        weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
 
-        with np.load(mel_filter_file) as f:
-            self.mel_filters = torch.from_numpy(f[f"mel_{self.num_mel_bins}"])
+        # Center freqs of each FFT bin
+        fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
+
+        # 'Center freqs' of mel bands - uniformly spaced between limits
+        min_mel = 0.0
+        max_mel = 45.245640471924965
+
+        mels = np.linspace(min_mel, max_mel, n_mels + 2)
+
+        mels = np.asanyarray(mels)
+
+        # Fill in the linear scale
+        f_min = 0.0
+        f_sp = 200.0 / 3
+        freqs = f_min + f_sp * mels
+
+        # And now the nonlinear scale
+        min_log_hz = 1000.0  # beginning of log region (Hz)
+        min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+        logstep = np.log(6.4) / 27.0  # step size for log region
+
+        # If we have vector data, vectorize
+        log_t = mels >= min_log_mel
+        freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
+
+        mel_f = freqs
+
+        fdiff = np.diff(mel_f)
+        ramps = np.subtract.outer(mel_f, fftfreqs)
+
+        for i in range(n_mels):
+            # lower and upper slopes for all bins
+            lower = -ramps[i] / fdiff[i]
+            upper = ramps[i+2] / fdiff[i+1]
+
+            # .. then intersect them with each other and zero
+            weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
+        weights *= enorm[:, np.newaxis]
+
+        return torch.from_numpy(weights)
 
     def _extract_fbank_features(
         self,
@@ -225,7 +272,3 @@ def __call__(
             padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
 
         return padded_inputs
-
-    def save_pretrained(self, pretrained_model_name_or_path, **kwargs):
-        super().save_pretrained(pretrained_model_name_or_path)
-        np.savez_compressed("mel_filters.npz", mel_80=self.mel_filters)

From d58b7a0c8cbac82f43704d22073ece6ec17369ef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 16:43:20 +0000
Subject: [PATCH 013/156] small nit

---
 .../models/whisper/feature_extraction_whisper.py       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 69433ef73df32..d21fd8b03e4bc 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -77,9 +77,9 @@ def __init__(
         self.return_attention_mask = True
         self.n_samples = chunk_length * sampling_rate
         self.nb_max_frame = self.n_samples // hop_length
-        
-        self.mel_filters = self.get_mel_filters(sampling_rate,n_fft,n_mels = num_mel_bins )
-    
+
+        self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=num_mel_bins)
+
     def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         # Initialize the weights
         n_mels = int(n_mels)
@@ -118,13 +118,13 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         for i in range(n_mels):
             # lower and upper slopes for all bins
             lower = -ramps[i] / fdiff[i]
-            upper = ramps[i+2] / fdiff[i+1]
+            upper = ramps[i + 2] / fdiff[i + 1]
 
             # .. then intersect them with each other and zero
             weights[i] = np.maximum(0, np.minimum(lower, upper))
 
         # Slaney-style mel is scaled to be approx constant energy per channel
-        enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
+        enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
         weights *= enorm[:, np.newaxis]
 
         return torch.from_numpy(weights)

From c61258b247f0c2b21a02d5cb3beb284ea01fa6e7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 22 Sep 2022 16:43:28 +0000
Subject: [PATCH 014/156] update test

---
 tests/models/whisper/test_modeling_whisper.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f979952216255..5b01a11df731f 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -749,9 +749,7 @@ def test_tiny_logits_librispeech(self):
 
         input_speech = self._load_datasamples(1)
 
-        feaure_extractor = WhisperFeatureExtractor(
-            "/home/arthur_huggingface_co/whisper/whisper/assets/mel_filters.npz"
-        )
+        feaure_extractor = WhisperFeatureExtractor()
         tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
         tokenizer.pad_token = 0
 
@@ -799,9 +797,7 @@ def test_large_logits_librispeech(self):
 
         input_speech = self._load_datasamples(1)
 
-        feaure_extractor = WhisperFeatureExtractor(
-            "/home/arthur_huggingface_co/whisper/whisper/assets/mel_filters.npz"
-        )
+        feaure_extractor = WhisperFeatureExtractor()
         tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
         tokenizer.pad_token = 0
 

From 6acc1311c21337ddb0dc2bc8020f8f0ab293a519 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 07:42:11 +0000
Subject: [PATCH 015/156] update tokenizer

---
 .../models/whisper/tokenization_whisper.py    | 585 +++++++++++++-----
 1 file changed, 422 insertions(+), 163 deletions(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 96fa87a125381..abdd3b80044ee 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -17,29 +17,58 @@
 import os
 from pathlib import Path
 from shutil import copyfile
+from functools import lru_cache
+
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import regex as re
+
 import sentencepiece
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import PreTrainedTokenizer, AddedToken
 from ...utils import logging
 
 
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
 logger = logging.get_logger(__name__)
 
 SPIECE_UNDERLINE = "▁"
 
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
-    "spm_file": "sentencepiece.bpe.model",
+    "tokenizer_file": "tokenization.json",
+    "merges_file":"merges.txt"
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
     },
-    "spm_file": {
-        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/sentencepiece.bpe.model"
+    "merges_file": {
+        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges_file.txt"
     },
 }
 
@@ -47,9 +76,140 @@
     "openai/whisper-base": 1024,
 }
 
-MUSTC_LANGS = ["pt", "fr", "ru", "nl", "ro", "it", "es", "de"]
 
-LANGUAGES = {"mustc": MUSTC_LANGS}
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "iw": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+
+# language code lookup by name, with a few language aliases
+TO_LANGUAGE_CODE = {
+    **{language: code for code, language in LANGUAGES.items()},
+    "burmese": "my",
+    "valencian": "ca",
+    "flemish": "nl",
+    "haitian": "ht",
+    "letzeburgesch": "lb",
+    "pushto": "ps",
+    "panjabi": "pa",
+    "moldavian": "ro",
+    "moldovan": "ro",
+    "sinhalese": "si",
+    "castilian": "es",
+}
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
 
 
 class WhisperTokenizer(PreTrainedTokenizer):
@@ -59,44 +219,24 @@ class WhisperTokenizer(PreTrainedTokenizer):
     This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
     the superclass for more information regarding such methods.
 
-    Args:
+     Args:
         vocab_file (`str`):
-            File containing the vocabulary.
-        spm_file (`str`):
-            Path to the [SentencePiece](https://github.com/google/sentencepiece) model file
-        bos_token (`str`, *optional*, defaults to `"<s>"`):
-            The beginning of sentence token.
-        eos_token (`str`, *optional*, defaults to `"</s>"`):
-            The end of sentence token.
-        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (`str`, *optional*, defaults to `"<pad>"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        do_upper_case (`bool`, *optional*, defaults to `False`):
-           Whether or not to uppercase the output when decoding.
-        do_lower_case (`bool`, *optional*, defaults to `False`):
-            Whether or not to lowercase the input when tokenizing.
-        tgt_lang (`str`, *optional*):
-            A string representing the target language.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-        **kwargs
-            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -104,57 +244,142 @@ class WhisperTokenizer(PreTrainedTokenizer):
     max_model_input_sizes = MAX_MODEL_INPUT_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
-    prefix_tokens: List[int] = []
-
     def __init__(
         self,
         vocab_file,
-        spm_file,
-        bos_token="<s>",
-        eos_token="</s>",
-        pad_token="<pad>",
-        unk_token="<unk>",
-        do_upper_case=False,
-        do_lower_case=False,
-        tgt_lang=None,
-        lang_codes=None,
-        sp_model_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs,
-    ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
+        merges_file,
+        multilingual, 
+        task=None,
+        language=None,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token=None,
+        add_prefix_space=False,
+        add_bos_token=False,
+        **kwargs
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
         super().__init__(
+            errors=errors,
+            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             pad_token=pad_token,
-            do_upper_case=do_upper_case,
-            do_lower_case=do_lower_case,
-            tgt_lang=tgt_lang,
-            lang_codes=lang_codes,
-            sp_model_kwargs=self.sp_model_kwargs,
+            add_prefix_space=add_prefix_space,
+            add_bos_token=add_bos_token,
             **kwargs,
         )
-        self.do_upper_case = do_upper_case
-        self.do_lower_case = do_lower_case
+        self.add_bos_token = add_bos_token
 
-        self.encoder = load_json(vocab_file)
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v: k for k, v in self.encoder.items()}
-        self.spm_file = spm_file
-        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+
+
+        specials = [
+            "<|startoftranscript|>",
+            *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+            "<|translate|>",
+            "<|transcribe|>",
+            "<|startoflm|>",
+            "<|startofprev|>",
+            "<|nocaptions|>",
+            "<|notimestamps|>",
+        ]
+
+        self.add_special_tokens(dict(additional_special_tokens=specials))
+        self.language = language
+        if language is not None :
+            additional_tokens = dict(zip(self.additional_special_tokens,self.additional_special_tokens_ids,))
+            self.language_token = additional_tokens[f"<|{self.language}|>"]
+        
+
+        translate = self.all_special_ids[-6]
+        transcribe = self.all_special_ids[-5]
+        sot_sequence = [self.all_special_ids[1]]
+        if language is not None:
+            sot_sequence.append(self.all_special_ids[1] + 1 + LANGUAGES.keys().index(language))
+        if task is not None:
+            sot_sequence.append(transcribe if task == "transcribe" else translate)
+        self.sot_sequence = sot_sequence
+        
 
-        if lang_codes is not None:
-            self.lang_codes = lang_codes
-            self.langs = LANGUAGES[lang_codes]
-            self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
-            self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}
+    @property
+    @lru_cache()
+    def all_language_tokens(self) -> Tuple[int]:
+        result = []
+        for token, token_id in zip(
+            self.additional_special_tokens,
+            self.additional_special_tokens_ids,
+        ):
+            if token.strip("<|>") in LANGUAGES:
+                result.append(token_id)
+        return tuple(result)
 
-            self._additional_special_tokens = self.lang_tokens
-            self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]
+    @property
+    @lru_cache()
+    def all_language_codes(self) -> Tuple[str]:
+        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
 
-            self.set_tgt_lang_special_tokens(self._tgt_lang)
-        else:
-            self.lang_code_to_id = {}
+    @property
+    @lru_cache()
+    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
+        return tuple(list(self.sot_sequence) + [self.no_timestamps])
+
+    @property
+    @lru_cache()
+    def non_speech_tokens(self) -> Tuple[int]:
+        """
+        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
+        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+
+        - ♪♪♪
+        - ( SPEAKING FOREIGN LANGUAGE )
+        - [DAVID] Hey there,
+
+        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
+        """
+
+        result = set()
+        symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』")
+        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+
+        # symbols that may be a single token or multiple tokens depending on the tokenizer.
+        # In case they're multiple tokens, suppress the first token, which is safe because:
+        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
+        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
+        miscellaneous = set("♩♪♫♬♭♮♯")
+        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
+
+        for symbol in symbols + list(miscellaneous):
+            for tokens in [self.encode(symbol), self.encode(" " + symbol)]:
+                if len(tokens) == 1 or symbol in miscellaneous:
+                    result.add(tokens[0])
+
+        return tuple(sorted(result))
+
+    def _get_single_token_id(self, text) -> int:
+        tokens = self.tokenizer.encode(text)
+        assert len(tokens) == 1, f"{text} is not encoded as a single token"
+        return tokens[0]
 
     @property
     def vocab_size(self) -> int:
@@ -164,47 +389,68 @@ def vocab_size(self) -> int:
     def tgt_lang(self) -> str:
         return self._tgt_lang
 
-    @tgt_lang.setter
-    def tgt_lang(self, new_tgt_lang) -> None:
-        self._tgt_lang = new_tgt_lang
-        self.set_tgt_lang_special_tokens(new_tgt_lang)
 
-    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
-        """Reset the special tokens to the target language setting. prefix=[eos, tgt_lang_code] and suffix=[eos]."""
-        lang_code_id = self.lang_code_to_id[tgt_lang]
-        self.prefix_tokens = [lang_code_id]
-
-    def _tokenize(self, text: str) -> List[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        return self.encoder.get(token, self.encoder[self.unk_token])
-
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) in a token (str) using the decoder."""
-        return self.decoder.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = self.sp_model.decode(tokens)
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
 
-        if self.do_upper_case:
-            out_string = out_string.upper()
-        return out_string
+        output = bos_token_ids + token_ids_0
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
-        """Build model inputs from a sequence by appending eos_token_id."""
         if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
+            return output
+
+        return output + bos_token_ids + token_ids_1
 
     def get_special_tokens_mask(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
         """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
             token_ids_0 (`List[int]`):
@@ -217,70 +463,83 @@ def get_special_tokens_mask(
         Returns:
             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
-
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
             )
 
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1]
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    def get_vocab(self) -> Dict:
-        vocab = self.encoder.copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
+        if not self.add_bos_token:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
+            )
 
-    def __getstate__(self) -> Dict:
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0))
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
 
-    def __setstate__(self, d: Dict) -> None:
-        self.__dict__ = d
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
 
-        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        save_dir = Path(save_directory)
-        assert save_dir.is_dir(), f"{save_directory} should be a directory"
-        vocab_save_path = save_dir / (
-            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
-        spm_save_path = save_dir / (
-            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )
 
-        save_json(self.encoder, vocab_save_path)
-
-        if os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path) and os.path.isfile(self.spm_file):
-            copyfile(self.spm_file, spm_save_path)
-        elif not os.path.isfile(self.spm_file):
-            with open(spm_save_path, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (str(vocab_save_path), str(spm_save_path))
-
-
-def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
-    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
-    spm.Load(str(path))
-    return spm
-
-
-def load_json(path: str) -> Union[Dict, List]:
-    with open(path, "r") as f:
-        return json.load(f)
-
-
-def save_json(data, path: str) -> None:
-    with open(path, "w") as f:
-        json.dump(data, f, indent=2)
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if is_split_into_words or add_prefix_space:
+            text = " " + text
+        return (text, kwargs)
+
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+        input_ids = []
+        for is_user, text in conversation.iter_texts():
+            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
+        if len(input_ids) > self.model_max_length:
+            input_ids = input_ids[-self.model_max_length :]
+        return input_ids

From 71b3be855fc1b409f4cbd5b82e61dd894b9bdf2d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 07:42:40 +0000
Subject: [PATCH 016/156] nit feature extraction

---
 .../models/whisper/feature_extraction_whisper.py   | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index d21fd8b03e4bc..bb61507835611 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -153,8 +153,6 @@ def _extract_fbank_features(
     def __call__(
         self,
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
-        padding: Union[bool, str, PaddingStrategy] = False,
-        max_length: Optional[int] = None,
         truncation: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
@@ -169,18 +167,6 @@ def __call__(
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy arrays or a list of list of float values.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-                Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                index) among:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            max_length (`int`, *optional*):
-                Maximum length of the returned list and optionally padding length (see above).
             truncation (`bool`):
                 Activates truncation to cut input sequences longer than *max_length* to *max_length*.
             pad_to_multiple_of (`int`, *optional*):

From b4983e43d229981d1783650a9e18685af48f1d91 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 10:03:07 +0000
Subject: [PATCH 017/156] update

---
 .../models/whisper/tokenization_whisper.py    | 84 +++++++++++--------
 .../models/whisper/test_processor_whisper.py  |  2 +-
 .../whisper/test_tokenization_whisper.py      | 42 +++++-----
 3 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index abdd3b80044ee..71671204c0a7d 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -29,37 +29,13 @@
 from ...utils import logging
 
 
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
 
-logger = logging.get_logger(__name__)
 
 SPIECE_UNDERLINE = "▁"
 
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
-    "tokenizer_file": "tokenization.json",
+    "tokenizer_file": "tokenizer.json",
     "merges_file":"merges.txt"
 }
 
@@ -197,6 +173,31 @@ def bytes_to_unicode():
     "castilian": "es",
 }
 
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+logger = logging.get_logger(__name__)
 
 def get_pairs(word):
     """
@@ -248,9 +249,9 @@ def __init__(
         self,
         vocab_file,
         merges_file,
-        multilingual, 
+        multilingual=True,
         task=None,
-        language=None,
+        language="en",
         errors="replace",
         unk_token="<|endoftext|>",
         bos_token="<|endoftext|>",
@@ -306,17 +307,34 @@ def __init__(
         ]
 
         self.add_special_tokens(dict(additional_special_tokens=specials))
+
+        if language is not None:
+            language = language.lower()
+            if language not in LANGUAGES:
+                if language in TO_LANGUAGE_CODE:
+                    language = TO_LANGUAGE_CODE[language]
+                else:
+                    raise ValueError(f"Unsupported language: {language}")
+
+        if multilingual:
+            task = task or "transcribe"
+            language = language or "en"
+        else:
+            task = None
+            language = None
+
         self.language = language
-        if language is not None :
-            additional_tokens = dict(zip(self.additional_special_tokens,self.additional_special_tokens_ids,))
-            self.language_token = additional_tokens[f"<|{self.language}|>"]
-        
 
         translate = self.all_special_ids[-6]
         transcribe = self.all_special_ids[-5]
         sot_sequence = [self.all_special_ids[1]]
-        if language is not None:
-            sot_sequence.append(self.all_special_ids[1] + 1 + LANGUAGES.keys().index(language))
+
+        if language is not None :
+            additional_tokens = dict(zip(self.additional_special_tokens,self.additional_special_tokens_ids,))
+            self.language_token = additional_tokens[f"<|{self.language}|>"]
+            langs = tuple(LANGUAGES.keys())
+            sot_sequence.append(self.all_special_ids[1] + 1 + langs.index(language))
+                    
         if task is not None:
             sot_sequence.append(transcribe if task == "transcribe" else translate)
         self.sot_sequence = sot_sequence
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index cf483e44a2fea..167f425c23463 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -19,7 +19,7 @@
 from shutil import copyfile
 
 from transformers import WhisperTokenizer, is_speech_available
-from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES, save_json
+from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio
 from transformers.utils import FEATURE_EXTRACTOR_NAME
 
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index d568421b3200d..1cf7cf520743b 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -16,20 +16,14 @@
 from pathlib import Path
 from shutil import copyfile
 
-from transformers import SPIECE_UNDERLINE, is_sentencepiece_available
+from transformers import SPIECE_UNDERLINE
 from transformers.models.whisper import WhisperTokenizer
-from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES, save_json
+from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
-SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
-
-if is_sentencepiece_available():
-    import sentencepiece as sp
-
-
 FR_CODE = 5
 ES_CODE = 10
 
@@ -43,20 +37,7 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
 
     def setUp(self):
         super().setUp()
-
-        spm_model = sp.SentencePieceProcessor()
-        spm_model.Load(SAMPLE_SP)
-        vocab = ["<s>", "<pad>", "</s>", "<unk>"]
-
-        vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-
-        save_dir = Path(self.tmpdirname)
-        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
-        if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
-            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
-
-        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
+        tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny")
         tokenizer.save_pretrained(self.tmpdirname)
 
     def test_convert_token_and_id(self):
@@ -122,7 +103,7 @@ def test_tokenizer_integration(self):
 
 @require_sentencepiece
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
-    checkpoint_name = "valhalla/s2t_mustc_multilinguial_medium"
+    checkpoint_name = "/home/arthur_huggingface_co/transformers/whisper/tiny"
 
     french_text = "C'est trop cool"
     spanish_text = "Esto es genial"
@@ -132,6 +113,21 @@ def setUpClass(cls):
         cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
         return cls
 
+    def test_tokenizer_equivalence(self):
+        text = "다람쥐 헌 쳇바퀴에 타고파"
+        multilingual_tokenizer =  WhisperTokenizer.from_pretrained(self.checkpoint_name, multi_lingual = True)
+        gpt2_tokenizer = WhisperTokenizer.from_pretrained(self.checkpoint_name, multi_lingual = False)
+
+        text = "다람쥐 헌 쳇바퀴에 타고파"
+        gpt2_tokens = gpt2_tokenizer.encode(text)
+        multilingual_tokens = multilingual_tokenizer.encode(text)
+
+
+        assert gpt2_tokenizer.decode(gpt2_tokens) == text
+        assert multilingual_tokenizer.decode(multilingual_tokens) == text
+        assert len(gpt2_tokens) > len(multilingual_tokens)
+
+
     def check_language_codes(self):
         self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)
         self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6)

From e66815aded6bfb2ea982bfa06c360935a6bb3d36 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 10:08:14 +0000
Subject: [PATCH 018/156] update tokenizer test

---
 tests/models/whisper/test_tokenization_whisper.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 1cf7cf520743b..da3fd24b15d9b 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -115,8 +115,8 @@ def setUpClass(cls):
 
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
-        multilingual_tokenizer =  WhisperTokenizer.from_pretrained(self.checkpoint_name, multi_lingual = True)
-        gpt2_tokenizer = WhisperTokenizer.from_pretrained(self.checkpoint_name, multi_lingual = False)
+        multilingual_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy")
+        gpt2_tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny")
 
         text = "다람쥐 헌 쳇바퀴에 타고파"
         gpt2_tokens = gpt2_tokenizer.encode(text)
@@ -127,6 +127,16 @@ def test_tokenizer_equivalence(self):
         assert multilingual_tokenizer.decode(multilingual_tokens) == text
         assert len(gpt2_tokens) > len(multilingual_tokens)
 
+        EXPECTED_MULTI = [ 9835, 22855,   168,    98,   238, 13431,   234, 43517,   229, 47053,
+          169,   222, 19086, 19840,  1313, 17974]
+
+        EXPECTED_ENG = [46695,    97,   167,   252,   234,   168,    98,   238,   220,   169,
+          245,   234, 23821,   111,   229,   167,   108,   242,   169,   222,
+          112,   168,   245,   238,   220,   169,   225,   222,   166,   111,
+          254,   169,   234,   234]
+        
+        self.assertListEqual(gpt2_tokens, EXPECTED_ENG)
+        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
     def check_language_codes(self):
         self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)

From a980ccc9eb02d0ab8732db2777ae2f931801ce0f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 14:57:43 +0000
Subject: [PATCH 019/156] adds logit processor and update tokenizer to get
 supress tokens

---
 src/transformers/generation_logits_process.py |  73 +++++++++++
 .../models/whisper/configuration_whisper.py   |   2 +-
 .../models/whisper/modeling_whisper.py        |  36 +++++-
 .../models/whisper/tokenization_whisper.py    | 122 ++++++++++++++----
 tests/models/whisper/test_modeling_whisper.py |  17 +++
 .../whisper/test_tokenization_whisper.py      |  24 ++++
 6 files changed, 247 insertions(+), 27 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 35ca6c57311d6..8e36abc21bc88 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -689,3 +689,76 @@ class LogitNormalization(LogitsProcessor, LogitsWarper):
     def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
         scores = scores.log_softmax(dim=-1)
         return scores
+
+class SuppressBlank(LogitsProcessor):
+    r"""
+    
+    """
+    def __init__(self, tokenizer, sample_begin: int = 1):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+
+    def __call__(self, input_ids, scores):
+        tokens = input_ids
+        logits = scores
+        if tokens.shape[1] == self.sample_begin:
+            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
+        return logits
+
+
+class SuppressTokens(LogitsProcessor):
+    r"""
+    
+    """
+    def __init__(self, suppress_tokens):
+        self.suppress_tokens = list(suppress_tokens)
+
+    def __call__(self, input_ids, scores):
+        logits = scores
+        logits[:, self.suppress_tokens] = -np.inf
+        return logits
+
+
+class ApplyTimestampRules(LogitsProcessor):
+    r"""
+    
+    """
+    def __init__(
+        self, tokenizer, sample_begin: int = 1, max_initial_timestamp_index: Optional[int] = None
+    ):
+        self.tokenizer = tokenizer
+        self.sample_begin = sample_begin
+        self.max_initial_timestamp_index = max_initial_timestamp_index
+
+    def __call__(self, input_ids, scores):
+        tokens = input_ids
+        logits = scores
+        # suppress <|notimestamps|> which is handled by without_timestamps
+        if self.tokenizer.no_timestamps is not None:
+            logits[:, self.tokenizer.no_timestamps] = -np.inf
+
+        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
+        for k in range(tokens.shape[0]):
+            seq = [t for t in tokens[k, self.sample_begin :].tolist()]
+            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
+            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
+
+            if last_was_timestamp:
+                if penultimate_was_timestamp:  # has to be non-timestamp
+                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
+                else:  # cannot be normal text tokens
+                    logits[k, : self.tokenizer.eot] = -np.inf
+
+        # apply the `max_initial_timestamp` option
+        if tokens.shape[1] == self.sample_begin and self.max_initial_timestamp_index is not None:
+            last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
+            logits[:, last_allowed + 1 :] = -np.inf
+
+        # if sum of probability over timestamps is above any other token, sample timestamp
+        logprobs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
+        for k in range(tokens.shape[0]):
+            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1)
+            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
+            if timestamp_logprob > max_text_token_logprob:
+                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
+        return logits
\ No newline at end of file
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index e29f165485439..6ad81e1197d44 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -123,6 +123,7 @@ def __init__(
         decoder_attention_heads=4,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
+        decoder_start_token_id=(50258, 50259, 50359),
         use_cache=True,
         is_encoder_decoder=True,
         activation_function="gelu",
@@ -131,7 +132,6 @@ def __init__(
         attention_dropout=0.0,
         activation_dropout=0.0,
         init_std=0.02,
-        decoder_start_token_id=2,
         scale_embedding=False,
         max_source_positions=1500,
         max_target_positions=448,
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index a0a814badb8b8..f696f5faba45e 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -17,7 +17,7 @@
 
 import math
 import random
-from typing import Iterable, Optional, Tuple
+from typing import Iterable, Optional, Tuple, Dict
 
 import torch
 import torch.nn.functional as F
@@ -1265,6 +1265,7 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
         r"decoder.version",
         r"model.encoder.embed_positions.weights",
         r"model.decoder.embed_positions.weights",
+        r"proj_out.weight"
     ]
     _keys_to_ignore_on_save = [
         r"model.encoder.embed_positions.weights",
@@ -1421,6 +1422,39 @@ def prepare_inputs_for_generation(
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
 
+    def _prepare_attention_mask_for_generation(
+        self,
+        inputs: torch.Tensor,
+        pad_token_id: Optional[int],
+        eos_token_id: Optional[int],
+    ) -> torch.LongTensor:
+        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.int, torch.long]
+        is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
+        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id)
+
+        # Check if input is input_ids and padded -> only then is attention_mask defined
+        if is_mel_spec and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
+            return inputs.ne(pad_token_id).long()
+        else : 
+            return None
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        decoder_start_token_id: int = None,
+        bos_token_id: int = None,
+        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        device: torch.device = None,
+    ) -> torch.LongTensor:
+
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            return model_kwargs.pop("decoder_input_ids")
+        else:
+            decoder_start_token_id = self.config.decoder_start_token_id
+            if device is None:
+                device = self.device
+            return torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id
+
     @staticmethod
     def _reorder_cache(past, beam_idx):
         reordered_past = ()
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 71671204c0a7d..61d8b42c46856 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -338,29 +338,7 @@ def __init__(
         if task is not None:
             sot_sequence.append(transcribe if task == "transcribe" else translate)
         self.sot_sequence = sot_sequence
-        
-
-    @property
-    @lru_cache()
-    def all_language_tokens(self) -> Tuple[int]:
-        result = []
-        for token, token_id in zip(
-            self.additional_special_tokens,
-            self.additional_special_tokens_ids,
-        ):
-            if token.strip("<|>") in LANGUAGES:
-                result.append(token_id)
-        return tuple(result)
-
-    @property
-    @lru_cache()
-    def all_language_codes(self) -> Tuple[str]:
-        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
-
-    @property
-    @lru_cache()
-    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
-        return tuple(list(self.sot_sequence) + [self.no_timestamps])
+    
 
     @property
     @lru_cache()
@@ -395,10 +373,66 @@ def non_speech_tokens(self) -> Tuple[int]:
         return tuple(sorted(result))
 
     def _get_single_token_id(self, text) -> int:
-        tokens = self.tokenizer.encode(text)
-        assert len(tokens) == 1, f"{text} is not encoded as a single token"
+        tokens = self.encode(text)
         return tokens[0]
 
+    @property
+    @lru_cache()
+    def eot(self) -> int:
+        return self.tokenizer.eos_token_id
+
+    @property
+    @lru_cache()
+    def sot(self) -> int:
+        return self._get_single_token_id("<|startoftranscript|>")
+
+    @property
+    @lru_cache()
+    def sot_lm(self) -> int:
+        return self._get_single_token_id("<|startoflm|>")
+
+    @property
+    @lru_cache()
+    def sot_prev(self) -> int:
+        return self._get_single_token_id("<|startofprev|>")
+
+    @property
+    @lru_cache()
+    def no_captions(self) -> int:
+        return self._get_single_token_id("<|nocaptions|>")
+
+    @property
+    @lru_cache()
+    def no_timestamps(self) -> int:
+        return self._get_single_token_id("<|notimestamps|>")
+
+    @property
+    @lru_cache()
+    def timestamp_begin(self) -> int:
+        return self.tokenizer.all_special_ids[-1] + 1
+
+    @property
+    @lru_cache()
+    def all_language_tokens(self) -> Tuple[int]:
+        result = []
+        for token, token_id in zip(
+            self.additional_special_tokens,
+            self.additional_special_tokens_ids,
+        ):
+            if token.strip("<|>") in LANGUAGES:
+                result.append(token_id)
+        return tuple(result)
+
+    @property
+    @lru_cache()
+    def all_language_codes(self) -> Tuple[str]:
+        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
+
+    @property
+    @lru_cache()
+    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
+        return tuple(list(self.sot_sequence) + [self.no_timestamps])
+
     @property
     def vocab_size(self) -> int:
         return len(self.encoder)
@@ -548,6 +582,22 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
 
         return vocab_file, merge_file
 
+    def decode_with_timestamps(self, tokens) -> str:
+        """
+        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
+        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
+        """
+        outputs = [[]]
+        for token in tokens:
+            if token >= self.timestamp_begin:
+                timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
+                outputs.append(timestamp)
+                outputs.append([])
+            else:
+                outputs[-1].append(token)
+        outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
+        return "".join(outputs)
+
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
         if is_split_into_words or add_prefix_space:
@@ -561,3 +611,25 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
         if len(input_ids) > self.model_max_length:
             input_ids = input_ids[-self.model_max_length :]
         return input_ids
+
+    
+    def _get_suppress_tokens(self, suppress_tokens = []) -> Tuple[int]:
+ 
+
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+
+        suppress_tokens.extend(
+            [self.sot, self.sot_prev, self.sot_lm]
+        )
+        if self.no_captions is not None:
+            # no-captions probability is collected separately
+            suppress_tokens.append(self.no_captions)
+
+        return tuple(sorted(set(suppress_tokens)))
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 5b01a11df731f..773f424b0e3ab 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -832,3 +832,20 @@ def test_large_logits_librispeech(self):
         # fmt: on
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
+
+    def test_generationt(self):
+        from transformers import WhisperTokenizer, WhisperFeatureExtractor, set_seed
+
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperForConditionalGeneration.from_pretrained("whisper/tiny")
+        model.to(torch_device)
+
+        input_speech = self._load_datasamples(1)
+        feaure_extractor = WhisperFeatureExtractor()
+
+        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
+
+        tokenizer = WhisperTokenizer.from_pretrained("whisper/tiny-multy")
+        generated_ids = model.generate(input_features, num_beams = 5)
+        transcript = tokenizer.batch_decode(generated_ids)
\ No newline at end of file
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index da3fd24b15d9b..b600be6ccbad6 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -138,6 +138,30 @@ def test_tokenizer_equivalence(self):
         self.assertListEqual(gpt2_tokens, EXPECTED_ENG)
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
+    def test_tokenizer_special(self):
+        multilingual_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy")
+        text = "[Denis] Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
+        multilingual_tokens = multilingual_tokenizer.encode(text)
+
+        EXPECTED_MULTI = [   58,    35,   268,   271,    60,  1911,     0,  1012,   366,   291,
+         2633,    30,   508,     6,  1301,   287,     6, 36107,   631,   220,
+        11178,   115, 15567,   871, 44393]
+        self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
+
+        self.assertEqual(text,multilingual_tokenizer.decode(multilingual_tokens))
+
+        jp_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy",multilingual=False, language = "japanese")
+        EXPECTED_JAP = [   58, 21306,   271,    60, 14690,     0,  1374,   389,   345,  4203,
+           30,   449,     6,  1872,   300,     6, 11011,  2234,  8358, 16268,
+          225,   115, 43357, 22174,  1556,   778, 25792,    83]
+
+        # parameters of the original tokenizer : multilingual False, language=Japanese
+        self.assertListEqual(jp_tokenizer.encode(text), EXPECTED_JAP)
+        
+        supress_tokens = multilingual_tokenizer.non_speech_tokens
+
+
+
     def check_language_codes(self):
         self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)
         self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6)

From 001dff24bbb05f8f964e9fdd0b142dda3a86a0df Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 14:58:53 +0000
Subject: [PATCH 020/156] style

---
 src/transformers/generation_logits_process.py |  22 ++-
 .../models/whisper/modeling_whisper.py        |   6 +-
 .../models/whisper/tokenization_whisper.py    |  53 +++----
 tests/models/whisper/test_modeling_whisper.py |   6 +-
 .../whisper/test_tokenization_whisper.py      | 144 +++++++++++++++---
 5 files changed, 159 insertions(+), 72 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 8e36abc21bc88..e3937bbf273c5 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -690,10 +690,10 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
         scores = scores.log_softmax(dim=-1)
         return scores
 
+
 class SuppressBlank(LogitsProcessor):
-    r"""
-    
-    """
+    r""" """
+
     def __init__(self, tokenizer, sample_begin: int = 1):
         self.tokenizer = tokenizer
         self.sample_begin = sample_begin
@@ -707,9 +707,8 @@ def __call__(self, input_ids, scores):
 
 
 class SuppressTokens(LogitsProcessor):
-    r"""
-    
-    """
+    r""" """
+
     def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
 
@@ -720,12 +719,9 @@ def __call__(self, input_ids, scores):
 
 
 class ApplyTimestampRules(LogitsProcessor):
-    r"""
-    
-    """
-    def __init__(
-        self, tokenizer, sample_begin: int = 1, max_initial_timestamp_index: Optional[int] = None
-    ):
+    r""" """
+
+    def __init__(self, tokenizer, sample_begin: int = 1, max_initial_timestamp_index: Optional[int] = None):
         self.tokenizer = tokenizer
         self.sample_begin = sample_begin
         self.max_initial_timestamp_index = max_initial_timestamp_index
@@ -761,4 +757,4 @@ def __call__(self, input_ids, scores):
             max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
             if timestamp_logprob > max_text_token_logprob:
                 logits[k, : self.tokenizer.timestamp_begin] = -np.inf
-        return logits
\ No newline at end of file
+        return logits
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index f696f5faba45e..b9e63176f2f08 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -17,7 +17,7 @@
 
 import math
 import random
-from typing import Iterable, Optional, Tuple, Dict
+from typing import Dict, Iterable, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -1265,7 +1265,7 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
         r"decoder.version",
         r"model.encoder.embed_positions.weights",
         r"model.decoder.embed_positions.weights",
-        r"proj_out.weight"
+        r"proj_out.weight",
     ]
     _keys_to_ignore_on_save = [
         r"model.encoder.embed_positions.weights",
@@ -1435,7 +1435,7 @@ def _prepare_attention_mask_for_generation(
         # Check if input is input_ids and padded -> only then is attention_mask defined
         if is_mel_spec and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
             return inputs.ne(pad_token_id).long()
-        else : 
+        else:
             return None
 
     def _prepare_decoder_input_ids_for_generation(
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 61d8b42c46856..212c41a08a9e5 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -15,37 +15,27 @@
 """Tokenization classes for Whisper."""
 import json
 import os
+from functools import lru_cache
 from pathlib import Path
 from shutil import copyfile
-from functools import lru_cache
-
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import regex as re
-
 import sentencepiece
 
-from ...tokenization_utils import PreTrainedTokenizer, AddedToken
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
-
-
 SPIECE_UNDERLINE = "▁"
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "tokenizer_file": "tokenizer.json",
-    "merges_file":"merges.txt"
-}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json", "merges_file": "merges.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
     },
-    "merges_file": {
-        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges_file.txt"
-    },
+    "merges_file": {"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges_file.txt"},
 }
 
 MAX_MODEL_INPUT_SIZES = {
@@ -53,7 +43,6 @@
 }
 
 
-
 LANGUAGES = {
     "en": "english",
     "zh": "chinese",
@@ -173,6 +162,7 @@
     "castilian": "es",
 }
 
+
 @lru_cache()
 def bytes_to_unicode():
     """
@@ -197,8 +187,10 @@ def bytes_to_unicode():
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
 
+
 logger = logging.get_logger(__name__)
 
+
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -293,8 +285,6 @@ def __init__(
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-
-
         specials = [
             "<|startoftranscript|>",
             *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
@@ -329,23 +319,27 @@ def __init__(
         transcribe = self.all_special_ids[-5]
         sot_sequence = [self.all_special_ids[1]]
 
-        if language is not None :
-            additional_tokens = dict(zip(self.additional_special_tokens,self.additional_special_tokens_ids,))
+        if language is not None:
+            additional_tokens = dict(
+                zip(
+                    self.additional_special_tokens,
+                    self.additional_special_tokens_ids,
+                )
+            )
             self.language_token = additional_tokens[f"<|{self.language}|>"]
             langs = tuple(LANGUAGES.keys())
             sot_sequence.append(self.all_special_ids[1] + 1 + langs.index(language))
-                    
+
         if task is not None:
             sot_sequence.append(transcribe if task == "transcribe" else translate)
         self.sot_sequence = sot_sequence
-    
 
     @property
     @lru_cache()
     def non_speech_tokens(self) -> Tuple[int]:
         """
-        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
-        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
+        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech annotations, to prevent
+        sampling texts that are not actually spoken in the audio, e.g.
 
         - ♪♪♪
         - ( SPEAKING FOREIGN LANGUAGE )
@@ -441,7 +435,6 @@ def vocab_size(self) -> int:
     def tgt_lang(self) -> str:
         return self._tgt_lang
 
-
     def bpe(self, token):
         if token in self.cache:
             return self.cache[token]
@@ -584,8 +577,8 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
 
     def decode_with_timestamps(self, tokens) -> str:
         """
-        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`.
-        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
+        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
+        given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
         """
         outputs = [[]]
         for token in tokens:
@@ -612,9 +605,7 @@ def _build_conversation_input_ids(self, conversation: "Conversation") -> List[in
             input_ids = input_ids[-self.model_max_length :]
         return input_ids
 
-    
-    def _get_suppress_tokens(self, suppress_tokens = []) -> Tuple[int]:
- 
+    def _get_suppress_tokens(self, suppress_tokens=[]) -> Tuple[int]:
 
         if isinstance(suppress_tokens, str):
             suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
@@ -625,9 +616,7 @@ def _get_suppress_tokens(self, suppress_tokens = []) -> Tuple[int]:
         elif suppress_tokens is None or len(suppress_tokens) == 0:
             suppress_tokens = []  # interpret empty string as an empty list
 
-        suppress_tokens.extend(
-            [self.sot, self.sot_prev, self.sot_lm]
-        )
+        suppress_tokens.extend([self.sot, self.sot_prev, self.sot_lm])
         if self.no_captions is not None:
             # no-captions probability is collected separately
             suppress_tokens.append(self.no_captions)
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 773f424b0e3ab..dc01837825671 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -834,7 +834,7 @@ def test_large_logits_librispeech(self):
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
     def test_generationt(self):
-        from transformers import WhisperTokenizer, WhisperFeatureExtractor, set_seed
+        from transformers import WhisperFeatureExtractor, WhisperTokenizer, set_seed
 
         torch_device = "cpu"
         set_seed(0)
@@ -847,5 +847,5 @@ def test_generationt(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("whisper/tiny-multy")
-        generated_ids = model.generate(input_features, num_beams = 5)
-        transcript = tokenizer.batch_decode(generated_ids)
\ No newline at end of file
+        generated_ids = model.generate(input_features, num_beams=5)
+        transcript = tokenizer.batch_decode(generated_ids)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index b600be6ccbad6..c0563e32d1e0c 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -115,52 +115,154 @@ def setUpClass(cls):
 
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
-        multilingual_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy")
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
+            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy"
+        )
         gpt2_tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny")
 
         text = "다람쥐 헌 쳇바퀴에 타고파"
         gpt2_tokens = gpt2_tokenizer.encode(text)
         multilingual_tokens = multilingual_tokenizer.encode(text)
 
-
         assert gpt2_tokenizer.decode(gpt2_tokens) == text
         assert multilingual_tokenizer.decode(multilingual_tokens) == text
         assert len(gpt2_tokens) > len(multilingual_tokens)
 
-        EXPECTED_MULTI = [ 9835, 22855,   168,    98,   238, 13431,   234, 43517,   229, 47053,
-          169,   222, 19086, 19840,  1313, 17974]
+        EXPECTED_MULTI = [
+            9835,
+            22855,
+            168,
+            98,
+            238,
+            13431,
+            234,
+            43517,
+            229,
+            47053,
+            169,
+            222,
+            19086,
+            19840,
+            1313,
+            17974,
+        ]
+
+        EXPECTED_ENG = [
+            46695,
+            97,
+            167,
+            252,
+            234,
+            168,
+            98,
+            238,
+            220,
+            169,
+            245,
+            234,
+            23821,
+            111,
+            229,
+            167,
+            108,
+            242,
+            169,
+            222,
+            112,
+            168,
+            245,
+            238,
+            220,
+            169,
+            225,
+            222,
+            166,
+            111,
+            254,
+            169,
+            234,
+            234,
+        ]
 
-        EXPECTED_ENG = [46695,    97,   167,   252,   234,   168,    98,   238,   220,   169,
-          245,   234, 23821,   111,   229,   167,   108,   242,   169,   222,
-          112,   168,   245,   238,   220,   169,   225,   222,   166,   111,
-          254,   169,   234,   234]
-        
         self.assertListEqual(gpt2_tokens, EXPECTED_ENG)
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
     def test_tokenizer_special(self):
-        multilingual_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy")
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
+            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy"
+        )
         text = "[Denis] Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
         multilingual_tokens = multilingual_tokenizer.encode(text)
 
-        EXPECTED_MULTI = [   58,    35,   268,   271,    60,  1911,     0,  1012,   366,   291,
-         2633,    30,   508,     6,  1301,   287,     6, 36107,   631,   220,
-        11178,   115, 15567,   871, 44393]
+        EXPECTED_MULTI = [
+            58,
+            35,
+            268,
+            271,
+            60,
+            1911,
+            0,
+            1012,
+            366,
+            291,
+            2633,
+            30,
+            508,
+            6,
+            1301,
+            287,
+            6,
+            36107,
+            631,
+            220,
+            11178,
+            115,
+            15567,
+            871,
+            44393,
+        ]
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
-        self.assertEqual(text,multilingual_tokenizer.decode(multilingual_tokens))
+        self.assertEqual(text, multilingual_tokenizer.decode(multilingual_tokens))
 
-        jp_tokenizer =  WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny-multy",multilingual=False, language = "japanese")
-        EXPECTED_JAP = [   58, 21306,   271,    60, 14690,     0,  1374,   389,   345,  4203,
-           30,   449,     6,  1872,   300,     6, 11011,  2234,  8358, 16268,
-          225,   115, 43357, 22174,  1556,   778, 25792,    83]
+        jp_tokenizer = WhisperTokenizer.from_pretrained(
+            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy", multilingual=False, language="japanese"
+        )
+        EXPECTED_JAP = [
+            58,
+            21306,
+            271,
+            60,
+            14690,
+            0,
+            1374,
+            389,
+            345,
+            4203,
+            30,
+            449,
+            6,
+            1872,
+            300,
+            6,
+            11011,
+            2234,
+            8358,
+            16268,
+            225,
+            115,
+            43357,
+            22174,
+            1556,
+            778,
+            25792,
+            83,
+        ]
 
         # parameters of the original tokenizer : multilingual False, language=Japanese
         self.assertListEqual(jp_tokenizer.encode(text), EXPECTED_JAP)
-        
-        supress_tokens = multilingual_tokenizer.non_speech_tokens
-
 
+        supress_tokens = multilingual_tokenizer.non_speech_tokens
 
     def check_language_codes(self):
         self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)

From 81a7099532d77fa0a16fe951bb80654a268e7faa Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 17:21:04 +0000
Subject: [PATCH 021/156] clean convert

---
 .../models/whisper/configuration_whisper.py   |  8 +-
 .../whisper/convert_openai_whisper_to_tfms.py | 73 ++++++++-----------
 .../whisper/feature_extraction_whisper.py     |  2 +-
 .../models/whisper/modeling_whisper.py        |  4 +-
 .../models/whisper/tokenization_whisper.py    |  7 +-
 tests/models/whisper/test_modeling_whisper.py | 16 ++--
 .../whisper/test_tokenization_whisper.py      |  2 -
 7 files changed, 46 insertions(+), 66 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 6ad81e1197d44..8dae86a89f650 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -117,7 +117,7 @@ def __init__(
         vocab_size=10000,
         feature_size=1,
         num_mel_bins=80,
-        encoder_layers=12,
+        encoder_layers=6,
         encoder_attention_heads=4,
         decoder_layers=6,
         decoder_attention_heads=4,
@@ -135,9 +135,9 @@ def __init__(
         scale_embedding=False,
         max_source_positions=1500,
         max_target_positions=448,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
+        pad_token_id=0,
+        bos_token_id=50256,
+        eos_token_id=50256,
         input_channels=1,
         tie_word_embeddings=True,
         **kwargs
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 32022a48c93f4..3229e12fffae4 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -17,11 +17,11 @@
 import torch
 from torch import nn
 
-from transformers import WhisperConfig, WhisperModel
+from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperModel
 
 
 def remove_ignore_keys_(state_dict):
-    ignore_keys = ["layers", "blocks", "proj_out.weight"]
+    ignore_keys = ["layers", "blocks"]
     for k in ignore_keys:
         state_dict.pop(k, None)
 
@@ -73,10 +73,10 @@ def make_linear_from_emb(emb):
 
 
 def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
-    m2m_100 = torch.load(checkpoint_path, map_location="cpu")
-    args = m2m_100["args"]
-    state_dict = m2m_100["model"]
-    lm_head_weights = state_dict["decoder.output_projection.weight"]
+    original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    dimensions = original_checkpoint["dims"]
+    state_dict = original_checkpoint["model_state_dict"]
+    proj_out_weights = state_dict["decoder.token_embedding.weight"]
 
     remove_ignore_keys_(state_dict)
     rename_keys(state_dict)
@@ -87,31 +87,15 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
 
     conv_kernel_sizes = [int(i) for i in args.conv_kernel_sizes.split(",")]
     config = WhisperConfig(
-        vocab_size=vocab_size,
-        max_source_positions=args.max_source_positions,
-        max_target_positions=args.max_target_positions,
-        encoder_layers=args.encoder_layers,
-        decoder_layers=args.decoder_layers,
-        encoder_attention_heads=args.encoder_attention_heads,
-        decoder_attention_heads=args.decoder_attention_heads,
-        encoder_ffn_dim=args.encoder_ffn_embed_dim,
-        decoder_ffn_dim=args.decoder_ffn_embed_dim,
-        d_model=args.encoder_embed_dim,
-        dropout=args.dropout,
-        attention_dropout=args.attention_dropout,
-        activation_dropout=args.activation_dropout,
-        activation_function="relu",
-        num_conv_layers=len(conv_kernel_sizes),
-        conv_channels=args.conv_channels,
-        conv_kernel_sizes=conv_kernel_sizes,
-        input_feat_per_channel=args.input_feat_per_channel,
-        input_channels=args.input_channels,
-        tie_word_embeddings=tie_embeds,
-        num_beams=5,
-        max_length=200,
-        use_cache=True,
-        decoder_start_token_id=2,
-        early_stopping=True,
+        vocab_size=dimensions["n_vocab"],
+        num_mel_bins=dimensions["n°mels"],
+        d_model=dimensions["n_audio_state"],
+        max_target_positions=dimensions["n_text_ctx"],
+        encoder_layers=dimensions["n_audio_layers"],
+        encoder_attention_heads=dimensions["n_audio_heads"],
+        decoder_layers=dimensions["n_text_layers"],
+        decoder_attention_heads=dimensions["n_text_heads"],
+        max_source_positions=dimensions["n_audio_ctx"],
     )
 
     model = WhisperForConditionalGeneration(config)
@@ -128,9 +112,9 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
         )
 
     if tie_embeds:
-        model.lm_head = make_linear_from_emb(model.model.decoder.embed_tokens)
+        model.proj_out = make_linear_from_emb(model.model.decoder.embed_tokens)
     else:
-        model.lm_head.weight.data = lm_head_weights
+        model.proj_out.weight.data = proj_out_weights
 
     model.save_pretrained(pytorch_dump_folder_path)
 
@@ -194,16 +178,7 @@ def _download(url: str, root: str) -> bytes:
     return model_bytes
 
 
-if __name__ == "__main__":
-    # parser = argparse.ArgumentParser()
-    # # Required parameters
-    # parser.add_argument("--fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
-    # parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
-    # args = parser.parse_args()
-    import torch
-
-    from transformers import WhisperConfig, WhisperModel
-
+def convert_every_model(save_dir):
     layers = [4, 6, 12, 24, 32]
     width = [384, 512, 768, 1024, 1280]
     heads = [6, 8, 12, 16, 20]
@@ -229,4 +204,14 @@ def _download(url: str, root: str) -> bytes:
         missing, unexpected = model.load_state_dict(new, strict=False)
         if missing == []:
             print("succesfully loaded")
-            model.save_pretrained(f"whisper/{n}")
+            model.save_pretrained(f"{save_dir}/{n}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # # Required parameters
+    parser.add_argument("--original_name", type=str, help="Path to the fairseq model (.pt) file.")
+    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    args = parser.parse_args()
+
+    convert_openai_whisper_to_tfms(parser.original_name, parser.pytorch_dump_folder_path)
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index bb61507835611..277800036f188 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -23,7 +23,7 @@
 
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
-from ...utils import PaddingStrategy, TensorType, logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index b9e63176f2f08..ae3640cfb6c25 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -17,11 +17,11 @@
 
 import math
 import random
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
-from torch import Tensor, nn
+from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 212c41a08a9e5..da2b8a532427b 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -16,12 +16,9 @@
 import json
 import os
 from functools import lru_cache
-from pathlib import Path
-from shutil import copyfile
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple
 
 import regex as re
-import sentencepiece
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
@@ -597,7 +594,7 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
             text = " " + text
         return (text, kwargs)
 
-    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
+    def _build_conversation_input_ids(self, conversation) -> List[int]:
         input_ids = []
         for is_user, text in conversation.iter_texts():
             input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index dc01837825671..4dd2756be88b6 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -775,10 +775,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                8.8958,  4.0423,  9.8841,  9.8493, 10.0628,  4.8472,  9.0100,  5.7364,
-                5.9165,  7.6322,  3.1579, 10.7269,  6.9586, 10.1852,  5.4714,  8.2995,
-                4.7507,  6.6723,  7.2764,  7.1831,  7.0388,  7.2191,  6.2364,  6.2117,
-                5.8797,  2.8099,  6.8319,  5.7094,  0.6999,  6.8444
+                8.8958, 4.0423, 9.8841, 9.8493, 10.0628, 4.8472, 9.0100, 5.7364,
+                5.9165, 7.6322, 3.1579, 10.7269, 6.9586, 10.1852, 5.4714, 8.2995,
+                4.7507, 6.6723, 7.2764, 7.1831, 7.0388, 7.2191, 6.2364, 6.2117,
+                5.8797, 2.8099, 6.8319, 5.7094, 0.6999, 6.8444
             ]
         )
         # fmt: on
@@ -823,10 +823,10 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1807,  1.1505,  4.8049,  3.9549,  2.7182,  4.1885, -0.4179,  2.8316,
-                2.0155,  2.2740,  2.6727,  1.3789,  0.5620,  2.2096,  1.6781,  2.8227,
-                1.4421,  0.9057,  1.3358,  2.2104,  2.7468,  2.0021,  2.6960,  1.5925,
-                2.2239,  1.9396,  4.0580,  5.7722,  4.8056,  4.2416
+                2.1807, 1.1505, 4.8049, 3.9549, 2.7182, 4.1885, -0.4179, 2.8316,
+                2.0155, 2.2740, 2.6727, 1.3789, 0.5620, 2.2096, 1.6781, 2.8227,
+                1.4421, 0.9057, 1.3358, 2.2104, 2.7468, 2.0021, 2.6960, 1.5925,
+                2.2239, 1.9396, 4.0580, 5.7722, 4.8056, 4.2416
             ]
         )
         # fmt: on
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index c0563e32d1e0c..34e4bef254b4f 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 
 import unittest
-from pathlib import Path
-from shutil import copyfile
 
 from transformers import SPIECE_UNDERLINE
 from transformers.models.whisper import WhisperTokenizer

From cbf1b4a69071a59af292ca815af605c0d5bedfa0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 23 Sep 2022 17:28:28 +0000
Subject: [PATCH 022/156] revert to original modeling tf utils

---
 src/transformers/modeling_tf_utils.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 4928605e288fb..c0f15592866e7 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -2213,25 +2213,18 @@ def save_pretrained(
             )
             for shard_file, shard in shards.items():
                 with h5py.File(os.path.join(save_directory, shard_file), mode="w") as shard_file:
-                    layers = []
-                    for layer in sorted(shard, key=lambda x: x.name):
-                        if "embed_tokens" in layer.name: 
-                            layer_name = layer.name
-                        if "model." in layer.name : 
-                            layer_name = layer.name
-                        else:
-                            layer_name = "/".join(layer.name.split("/")[1:])
-                        param_dset = shard_file.create_dataset(
-                                layer_name, layer.numpy().shape, dtype=layer.numpy().dtype
-                        )
-                        param_dset[:] = layer.numpy()
-                        layers.append(layer_name.encode("utf8"))
                     save_attributes_to_hdf5_group(
                         shard_file,
                         "layer_names",
-                        layers,
+                        ["/".join(layer.name.split("/")[1:]).encode("utf8") for layer in shard],
                     )
 
+                    for layer in sorted(shard, key=lambda x: x.name):
+                        param_dset = shard_file.create_dataset(
+                            "/".join(layer.name.split("/")[1:]), layer.numpy().shape, dtype=layer.numpy().dtype
+                        )
+                        param_dset[:] = layer.numpy()
+
         if push_to_hub:
             self._upload_modified_files(
                 save_directory, repo_id, files_timestamps, commit_message=commit_message, token=token
@@ -3061,6 +3054,7 @@ class TFWrappedEmbeddings:
     def __init__(self, layer, abs_scope_name=None):
         self._layer = layer
         self._abs_scope_name = abs_scope_name
+        self.vocab_size = self._layer.vocab_size
 
     def call(self, inputs, mode="embedding"):
         if self._abs_scope_name is None:

From 66237cb36e3addd24769d7d10e4c495f38bafa9c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 06:07:39 +0000
Subject: [PATCH 023/156] Update

---
 src/transformers/generation_logits_process.py |  7 +-
 .../models/whisper/configuration_whisper.py   | 19 +++-
 .../models/whisper/tokenization_whisper.py    | 90 ++++---------------
 .../models/whisper/test_processor_whisper.py  |  3 -
 .../whisper/test_tokenization_whisper.py      |  4 +-
 5 files changed, 39 insertions(+), 84 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index e3937bbf273c5..2d12e1c5d07a1 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -694,15 +694,16 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
 class SuppressBlank(LogitsProcessor):
     r""" """
 
-    def __init__(self, tokenizer, sample_begin: int = 1):
-        self.tokenizer = tokenizer
+    def __init__(self, blank_token_id, eos_token_id, sample_begin: int = 1):
+        self.blank_token_id = blank_token_id
+        self.eos_token_id = eos_token_id
         self.sample_begin = sample_begin
 
     def __call__(self, input_ids, scores):
         tokens = input_ids
         logits = scores
         if tokens.shape[1] == self.sample_begin:
-            logits[:, self.tokenizer.encode(" ") + [self.tokenizer.eot]] = -np.inf
+            logits[:, self.blank_token_id + [self.eos_token_id]] = -np.inf
         return logits
 
 
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 8dae86a89f650..81530a2e578a8 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -24,6 +24,20 @@
     "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/config.json",
 }
 
+# fmt: off
+NON_SPEECH_TOKENS = [    
+    1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
+    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
+    63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
+    893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
+    3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
+    7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
+    14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
+    22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
+    42863, 47425, 49870, 50254
+]
+# fmt: on
+    
 
 class WhisperConfig(PretrainedConfig):
     r"""
@@ -136,8 +150,8 @@ def __init__(
         max_source_positions=1500,
         max_target_positions=448,
         pad_token_id=0,
-        bos_token_id=50256,
-        eos_token_id=50256,
+        bos_token_id=50258,
+        eos_token_id=50257,
         input_channels=1,
         tie_word_embeddings=True,
         **kwargs
@@ -164,6 +178,7 @@ def __init__(
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
         self.feature_size = feature_size
+        self.non_speech_tokens = NON_SPEECH_TOKENS
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index da2b8a532427b..6885e5aa0478b 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -22,6 +22,7 @@
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
+from .configuration_whisper import NON_SPEECH_TOKENS
 
 
 SPIECE_UNDERLINE = "▁"
@@ -238,7 +239,6 @@ def __init__(
         self,
         vocab_file,
         merges_file,
-        multilingual=True,
         task=None,
         language="en",
         errors="replace",
@@ -249,7 +249,8 @@ def __init__(
         add_prefix_space=False,
         add_bos_token=False,
         **kwargs
-    ):
+    ):        
+
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
@@ -265,6 +266,8 @@ def __init__(
             **kwargs,
         )
         self.add_bos_token = add_bos_token
+        self.language=language
+        self.task=task
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
@@ -282,41 +285,15 @@ def __init__(
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-        specials = [
-            "<|startoftranscript|>",
-            *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-            "<|translate|>",
-            "<|transcribe|>",
-            "<|startoflm|>",
-            "<|startofprev|>",
-            "<|nocaptions|>",
-            "<|notimestamps|>",
-        ]
-
-        self.add_special_tokens(dict(additional_special_tokens=specials))
-
-        if language is not None:
-            language = language.lower()
-            if language not in LANGUAGES:
-                if language in TO_LANGUAGE_CODE:
-                    language = TO_LANGUAGE_CODE[language]
-                else:
-                    raise ValueError(f"Unsupported language: {language}")
-
-        if multilingual:
-            task = task or "transcribe"
-            language = language or "en"
-        else:
-            task = None
-            language = None
-
-        self.language = language
 
+    @property
+    @lru_cache()
+    def sot_sequence(self) -> Tuple[int]:
         translate = self.all_special_ids[-6]
         transcribe = self.all_special_ids[-5]
         sot_sequence = [self.all_special_ids[1]]
 
-        if language is not None:
+        if self.language is not None:
             additional_tokens = dict(
                 zip(
                     self.additional_special_tokens,
@@ -325,52 +302,16 @@ def __init__(
             )
             self.language_token = additional_tokens[f"<|{self.language}|>"]
             langs = tuple(LANGUAGES.keys())
-            sot_sequence.append(self.all_special_ids[1] + 1 + langs.index(language))
-
-        if task is not None:
-            sot_sequence.append(transcribe if task == "transcribe" else translate)
-        self.sot_sequence = sot_sequence
-
-    @property
-    @lru_cache()
-    def non_speech_tokens(self) -> Tuple[int]:
-        """
-        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech annotations, to prevent
-        sampling texts that are not actually spoken in the audio, e.g.
-
-        - ♪♪♪
-        - ( SPEAKING FOREIGN LANGUAGE )
-        - [DAVID] Hey there,
-
-        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
-        """
+            sot_sequence.append(self.all_special_ids[1] + 1 + langs.index(self.language))
 
-        result = set()
-        symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』")
-        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
-
-        # symbols that may be a single token or multiple tokens depending on the tokenizer.
-        # In case they're multiple tokens, suppress the first token, which is safe because:
-        # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
-        # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
-        miscellaneous = set("♩♪♫♬♭♮♯")
-        assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
-
-        for symbol in symbols + list(miscellaneous):
-            for tokens in [self.encode(symbol), self.encode(" " + symbol)]:
-                if len(tokens) == 1 or symbol in miscellaneous:
-                    result.add(tokens[0])
-
-        return tuple(sorted(result))
+        if self.task is not None:
+            sot_sequence.append(transcribe if self.task == "transcribe" else translate)
+        return sot_sequence
 
     def _get_single_token_id(self, text) -> int:
         tokens = self.encode(text)
         return tokens[0]
 
-    @property
-    @lru_cache()
-    def eot(self) -> int:
-        return self.tokenizer.eos_token_id
 
     @property
     @lru_cache()
@@ -535,7 +476,7 @@ def _convert_token_to_id(self, token):
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
+        return self.decoder.get(index, self.decoder.get(self.unk_token_id))
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
@@ -602,6 +543,7 @@ def _build_conversation_input_ids(self, conversation) -> List[int]:
             input_ids = input_ids[-self.model_max_length :]
         return input_ids
 
+    # TODO move to the logit processor 
     def _get_suppress_tokens(self, suppress_tokens=[]) -> Tuple[int]:
 
         if isinstance(suppress_tokens, str):
@@ -609,7 +551,7 @@ def _get_suppress_tokens(self, suppress_tokens=[]) -> Tuple[int]:
 
         if -1 in suppress_tokens:
             suppress_tokens = [t for t in suppress_tokens if t >= 0]
-            suppress_tokens.extend(self.non_speech_tokens)
+            suppress_tokens.extend(NON_SPEECH_TOKENS)
         elif suppress_tokens is None or len(suppress_tokens) == 0:
             suppress_tokens = []  # interpret empty string as an empty list
 
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index 167f425c23463..aaff4a2b13192 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -30,9 +30,6 @@
     from transformers import WhisperFeatureExtractor, WhisperProcessor
 
 
-SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
-
-
 @require_torch
 @require_torchaudio
 @require_sentencepiece
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 34e4bef254b4f..3666bb564ad66 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -103,8 +103,8 @@ def test_tokenizer_integration(self):
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
     checkpoint_name = "/home/arthur_huggingface_co/transformers/whisper/tiny"
 
-    french_text = "C'est trop cool"
-    spanish_text = "Esto es genial"
+    transcript = "'<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Nor is Mr. Quilters manner less interesting than his matter.<|endoftext|>'"
+    clean_transcript = "  Nor is Mr. Quilters manner less interesting than his matter."
 
     @classmethod
     def setUpClass(cls):

From d2cfce36f85c72c9b23796bf62c462484d300470 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 09:42:30 +0000
Subject: [PATCH 024/156] update

---
 .../whisper/feature_extraction_whisper.py     |  4 ++--
 .../models/whisper/processing_whisper.py      | 19 +------------------
 2 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 277800036f188..e920d4adfc816 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -127,7 +127,7 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
         weights *= enorm[:, np.newaxis]
 
-        return torch.from_numpy(weights)
+        return weights
 
     def _extract_fbank_features(
         self,
@@ -141,7 +141,7 @@ def _extract_fbank_features(
         stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
         magnitudes = stft[:, :-1].abs() ** 2
 
-        filters = self.mel_filters
+        filters = torch.from_numpy(self.mel_filters)
         mel_spec = filters @ magnitudes
 
         log_spec = torch.clamp(mel_spec, min=1e-10).log10()
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 2eff12f89dad2..319d7f5c1becf 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -36,7 +36,7 @@ class WhisperProcessor(ProcessorMixin):
             An instance of [`WhisperTokenizer`]. The tokenizer is a required input.
     """
     feature_extractor_class = "WhisperFeatureExtractor"
-    tokenizer_class = "GPT2Tokenizer"
+    tokenizer_class = "WhisperTokenizer"
 
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
@@ -93,20 +93,3 @@ def decode(self, *args, **kwargs):
         the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
-
-    @contextmanager
-    def as_target_processor(self):
-        """
-        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
-        Whisper.
-        """
-        warnings.warn(
-            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
-            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
-            "your audio inputs, or in a separate call."
-        )
-        self._in_target_context_manager = True
-        self.current_processor = self.tokenizer
-        yield
-        self.current_processor = self.feature_extractor
-        self._in_target_context_manager = False

From ca2a225827159a1c11e2904368ce053a5b9ecbe4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 09:43:06 +0000
Subject: [PATCH 025/156] nit

---
 tests/models/whisper/test_modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 4dd2756be88b6..61989e91b58ec 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -833,7 +833,7 @@ def test_large_logits_librispeech(self):
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
-    def test_generationt(self):
+    def test_generation(self):
         from transformers import WhisperFeatureExtractor, WhisperTokenizer, set_seed
 
         torch_device = "cpu"

From 5afbaad69371acff1b7425bc32b73073e7a4239a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 20:11:30 +0000
Subject: [PATCH 026/156] clean convert file

---
 .../whisper/convert_openai_whisper_to_tfms.py | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 3229e12fffae4..fe418390f60a6 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -72,29 +72,32 @@ def make_linear_from_emb(emb):
     return lin_layer
 
 
-def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
-    original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
+def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, checkpoint_path = "weights"):
+    full_path = os.path.join(os.getcwd(),checkpoint_path)
+    if not os.path.isdir(os.path.join(full_path)):
+        os.makedirs(full_path, exist_ok=True)
+        try:
+            _ , checkpoint_path = _download(_MODELS[checkpoint_name],full_path )
+        except KeyError:
+            print("The original checkpoint should be in _MODELS ")
+
+    print(f"Loading model from : {full_path}/{checkpoint_name}")
+    original_checkpoint = torch.load(os.path.join(full_path,checkpoint_name)+".pt", map_location="cpu")
     dimensions = original_checkpoint["dims"]
     state_dict = original_checkpoint["model_state_dict"]
-    proj_out_weights = state_dict["decoder.token_embedding.weight"]
 
     remove_ignore_keys_(state_dict)
     rename_keys(state_dict)
 
-    vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
-
-    tie_embeds = args.share_decoder_input_output_embed
-
-    conv_kernel_sizes = [int(i) for i in args.conv_kernel_sizes.split(",")]
     config = WhisperConfig(
         vocab_size=dimensions["n_vocab"],
-        num_mel_bins=dimensions["n°mels"],
+        num_mel_bins=dimensions["n_mels"],
         d_model=dimensions["n_audio_state"],
         max_target_positions=dimensions["n_text_ctx"],
-        encoder_layers=dimensions["n_audio_layers"],
-        encoder_attention_heads=dimensions["n_audio_heads"],
-        decoder_layers=dimensions["n_text_layers"],
-        decoder_attention_heads=dimensions["n_text_heads"],
+        encoder_layers=dimensions["n_audio_layer"],
+        encoder_attention_heads=dimensions["n_audio_head"],
+        decoder_layers=dimensions["n_text_layer"],
+        decoder_attention_heads=dimensions["n_text_head"],
         max_source_positions=dimensions["n_audio_ctx"],
     )
 
@@ -111,12 +114,7 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
             f" but all the following weights are missing {missing}"
         )
 
-    if tie_embeds:
-        model.proj_out = make_linear_from_emb(model.model.decoder.embed_tokens)
-    else:
-        model.proj_out.weight.data = proj_out_weights
-
-    model.save_pretrained(pytorch_dump_folder_path)
+    model.save_pretrained(os.path.join(pytorch_dump_folder_path,checkpoint_name))
 
 
 _MODELS = {
@@ -153,7 +151,7 @@ def _download(url: str, root: str) -> bytes:
     if os.path.isfile(download_target):
         model_bytes = open(download_target, "rb").read()
         if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
-            return model_bytes
+            return None, download_target
         else:
             warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
 
@@ -175,7 +173,7 @@ def _download(url: str, root: str) -> bytes:
             "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
         )
 
-    return model_bytes
+    return model_bytes, download_target
 
 
 def convert_every_model(save_dir):
@@ -211,7 +209,7 @@ def convert_every_model(save_dir):
     parser = argparse.ArgumentParser()
     # # Required parameters
     parser.add_argument("--original_name", type=str, help="Path to the fairseq model (.pt) file.")
-    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    parser.add_argument("--pytorch_dump_folder_path", default="whisper-converted", type=str, help="Path to the output PyTorch model.")
     args = parser.parse_args()
 
-    convert_openai_whisper_to_tfms(parser.original_name, parser.pytorch_dump_folder_path)
+    convert_openai_whisper_to_tfms(args.original_name, args.pytorch_dump_folder_path)

From 9ce0bc9ed835cefff1944d9fc591cccf9f2e1ffc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:09:10 +0000
Subject: [PATCH 027/156] update tests and nits

---
 .../models/whisper/configuration_whisper.py   |  2 +-
 .../whisper/feature_extraction_whisper.py     |  6 +-
 .../models/whisper/modeling_whisper.py        | 93 ++-----------------
 tests/models/whisper/test_modeling_whisper.py | 56 ++++++-----
 4 files changed, 39 insertions(+), 118 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 81530a2e578a8..63969899e071a 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -150,7 +150,7 @@ def __init__(
         max_source_positions=1500,
         max_target_positions=448,
         pad_token_id=0,
-        bos_token_id=50258,
+        bos_token_id=50257,
         eos_token_id=50257,
         input_channels=1,
         tie_word_embeddings=True,
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index e920d4adfc816..9ddd95fb27e91 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -158,6 +158,8 @@ def __call__(
         return_tensors: Optional[Union[str, TensorType]] = None,
         sampling_rate: Optional[int] = None,
         return_attention_mask: Optional[bool] = None,
+        padding: Optional[str]="max_length",
+        max_length: Optional[int]=None,
         **kwargs
     ) -> BatchFeature:
         """
@@ -237,8 +239,8 @@ def __call__(
 
         padded_inputs = self.pad(
             encoded_inputs,
-            padding="max_length",
-            max_length=self.nb_max_frame,
+            padding=padding,
+            max_length=max_length if max_length else self.nb_max_frame,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
             return_attention_mask=return_attention_mask,
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ae3640cfb6c25..6deb262d70c33 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -130,77 +130,6 @@ def forward(self, input_features):
         return hidden_states
 
 
-# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextSinusoidalPositionalEmbedding with Speech2Text->Whisper
-class WhisperSinusoidalPositionalEmbedding(nn.Module):
-    """This module produces sinusoidal positional embeddings of any length."""
-
-    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
-        super().__init__()
-        self.offset = 2
-        self.embedding_dim = embedding_dim
-        self.padding_idx = padding_idx
-        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
-
-    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
-        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
-        if hasattr(self, "weights"):
-            # in forward put the weights on the correct dtype and device of the param
-            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
-
-        self.weights = nn.Parameter(emb_weights)
-        self.weights.requires_grad = False
-        self.weights.detach_()
-
-    @staticmethod
-    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
-        """
-        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
-        description in Section 3.5 of "Attention Is All You Need".
-        """
-        half_dim = embedding_dim // 2
-        emb = math.log(10000) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
-        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
-        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
-        if embedding_dim % 2 == 1:
-            # zero pad
-            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
-        if padding_idx is not None:
-            emb[padding_idx, :] = 0
-        return emb
-
-    @torch.no_grad()
-    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
-        bsz, seq_len = input_ids.size()
-        # Create the position ids from the input token ids. Any padded tokens remain padded.
-        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
-            input_ids.device
-        )
-
-        # expand embeddings if needed
-        max_pos = self.padding_idx + 1 + seq_len
-        if max_pos > self.weights.size(0):
-            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
-
-        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
-
-    def create_position_ids_from_input_ids(
-        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
-    ):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
-        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
-
-        Args:
-            x: torch.Tensor x:
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
-
-
 class WhisperPositionalEmbedding(nn.Embedding):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__(num_positions, embedding_dim)
@@ -571,8 +500,7 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
         """
         Computes the output length of the convolutional layers
         """
-        for i in range(self.config.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
+        input_lengths = (input_lengths - 1) // 2 + 1
 
         return input_lengths
 
@@ -724,12 +652,6 @@ def __init__(self, config: WhisperConfig):
         self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
         self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
 
-        # self.embed_positions = WhisperSinusoidalPositionalEmbedding(
-        #     self.max_source_positions,
-        #     embed_dim,
-        #     self.padding_idx,
-        # )
-
         self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
 
         self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
@@ -797,6 +719,8 @@ def forward(
         # expand attention_mask
         if attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            if attention_mask.shape[-1]>self.max_source_positions:
+                attention_mask = attention_mask[:,:self.max_source_positions]
             attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
 
         encoder_states = () if output_hidden_states else None
@@ -860,7 +784,6 @@ class WhisperDecoder(WhisperPreTrainedModel):
 
     Args:
         config: WhisperConfig
-        embed_tokens (nn.Embedding): output embedding
     """
 
     def __init__(self, config: WhisperConfig):
@@ -1263,13 +1186,13 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
     _keys_to_ignore_on_load_missing = [
         r"encoder.version",
         r"decoder.version",
-        r"model.encoder.embed_positions.weights",
-        r"model.decoder.embed_positions.weights",
+        r"model.encoder.embed_positions.weight",
+        r"model.decoder.embed_positions.weight",
         r"proj_out.weight",
     ]
     _keys_to_ignore_on_save = [
-        r"model.encoder.embed_positions.weights",
-        r"model.decoder.embed_positions.weights",
+        r"model.encoder.embed_positions.weight",
+        r"model.decoder.embed_positions.weight",
     ]
 
     def __init__(self, config: WhisperConfig):
@@ -1434,7 +1357,7 @@ def _prepare_attention_mask_for_generation(
 
         # Check if input is input_ids and padded -> only then is attention_mask defined
         if is_mel_spec and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
-            return inputs.ne(pad_token_id).long()
+            return inputs.ne(pad_token_id).long()[:,:,:self.max_source_positions,:self.max_source_positions]
         else:
             return None
 
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 61989e91b58ec..3cb32d5312ff4 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -40,7 +40,14 @@
 if is_torch_available():
     import torch
 
-    from transformers import WhisperForConditionalGeneration, WhisperModel, WhisperProcessor
+    from transformers import (
+        WhisperFeatureExtractor,
+        WhisperForConditionalGeneration,
+        WhisperModel,
+        WhisperProcessor,
+        WhisperTokenizer,
+        set_seed,
+    )
     from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
 
 
@@ -82,28 +89,27 @@ def __init__(
         self,
         parent,
         batch_size=13,
-        seq_length=7,
+        seq_length=30,
         is_training=True,
         use_labels=False,
         vocab_size=99,
         hidden_size=16,
         num_hidden_layers=2,
         num_attention_heads=4,
-        intermediate_size=4,
-        num_conv_layers=2,
-        conv_kernel_sizes=(5, 5),
-        conv_channels=32,
-        input_feat_per_channel=24,
         input_channels=1,
-        hidden_act="relu",
+        hidden_act="gelu",
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         max_position_embeddings=20,
-        max_source_positions=20,
-        max_target_positions=20,
-        eos_token_id=2,
-        pad_token_id=1,
-        bos_token_id=0,
+        max_source_positions=15,
+        max_target_positions=4,
+        bos_token_id=50257,
+        eos_token_id=50257,
+        pad_token_id=0,
+        num_mel_bins=80,
+        decoder_start_token_id=(50258, 50259, 50359),
+        num_conv_layers=2
+
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -114,28 +120,26 @@ def __init__(
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        self.intermediate_size = intermediate_size
-        self.num_conv_layers = num_conv_layers
-        self.conv_kernel_sizes = conv_kernel_sizes
-        self.conv_channels = conv_channels
-        self.input_feat_per_channel = input_feat_per_channel
         self.input_channels = input_channels
         self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.num_mel_bins=num_mel_bins
         self.max_position_embeddings = max_position_embeddings
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
         self.eos_token_id = eos_token_id
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
+        self.decoder_start_token_id=decoder_start_token_id
+        self.num_conv_layers=num_conv_layers
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor(
-            [self.batch_size, self.seq_length, self.input_feat_per_channel], self.vocab_size
+            [self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size
         )
         attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
-        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(2)
+        decoder_input_ids = torch.tensor(self.decoder_start_token_id)
 
         config = self.get_config()
         inputs_dict = prepare_whisper_inputs_dict(
@@ -154,12 +158,6 @@ def get_config(self):
             decoder_layers=self.num_hidden_layers,
             encoder_attention_heads=self.num_attention_heads,
             decoder_attention_heads=self.num_attention_heads,
-            encoder_ffn_dim=self.intermediate_size,
-            decoder_ffn_dim=self.intermediate_size,
-            num_conv_layers=self.num_conv_layers,
-            conv_kernel_sizes=self.conv_kernel_sizes,
-            conv_channels=self.conv_channels,
-            input_feat_per_channel=self.input_feat_per_channel,
             input_channels=self.input_channels,
             dropout=self.hidden_dropout_prob,
             attention_dropout=self.attention_probs_dropout_prob,
@@ -721,7 +719,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 @require_torchaudio
 @require_sentencepiece
 @require_tokenizers
-@slow
+# @slow
 class WhisperModelIntegrationTests(unittest.TestCase):
     @cached_property
     def default_processor(self):
@@ -786,7 +784,6 @@ def test_tiny_logits_librispeech(self):
 
     def test_large_logits_librispeech(self):
 
-        from transformers import GPT2Tokenizer, WhisperFeatureExtractor, set_seed
 
         torch_device = "cpu"
         set_seed(0)
@@ -798,7 +795,7 @@ def test_large_logits_librispeech(self):
         input_speech = self._load_datasamples(1)
 
         feaure_extractor = WhisperFeatureExtractor()
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer = WhisperTokenizer.from_pretrained("gpt2")
         tokenizer.pad_token = 0
 
         processor = WhisperProcessor(feaure_extractor, tokenizer)
@@ -834,7 +831,6 @@ def test_large_logits_librispeech(self):
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
     def test_generation(self):
-        from transformers import WhisperFeatureExtractor, WhisperTokenizer, set_seed
 
         torch_device = "cpu"
         set_seed(0)

From 16033a55dac6e95bdea88394856dbfc08f3265f1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:28:11 +0000
Subject: [PATCH 028/156] quality

---
 .../models/whisper/configuration_whisper.py   |  4 +--
 .../whisper/convert_openai_whisper_to_tfms.py | 28 +++++++++----------
 .../whisper/feature_extraction_whisper.py     |  4 +--
 .../models/whisper/modeling_whisper.py        |  6 ++--
 .../models/whisper/processing_whisper.py      |  1 -
 .../models/whisper/tokenization_whisper.py    | 10 +++----
 tests/models/whisper/test_modeling_whisper.py | 14 ++++------
 .../whisper/test_tokenization_whisper.py      | 10 +++----
 8 files changed, 35 insertions(+), 42 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 63969899e071a..96af4691b5acd 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -25,7 +25,7 @@
 }
 
 # fmt: off
-NON_SPEECH_TOKENS = [    
+NON_SPEECH_TOKENS = [
     1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
     26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
     63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
@@ -37,7 +37,7 @@
     42863, 47425, 49870, 50254
 ]
 # fmt: on
-    
+
 
 class WhisperConfig(PretrainedConfig):
     r"""
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index fe418390f60a6..9c70c759cd558 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -13,9 +13,15 @@
 # limitations under the License.
 
 import argparse
+import hashlib
+import io
+import os
+import urllib
+import warnings
 
 import torch
 from torch import nn
+from tqdm import tqdm
 
 from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperModel
 
@@ -72,17 +78,17 @@ def make_linear_from_emb(emb):
     return lin_layer
 
 
-def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, checkpoint_path = "weights"):
-    full_path = os.path.join(os.getcwd(),checkpoint_path)
+def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, checkpoint_path="weights"):
+    full_path = os.path.join(os.getcwd(), checkpoint_path)
     if not os.path.isdir(os.path.join(full_path)):
         os.makedirs(full_path, exist_ok=True)
         try:
-            _ , checkpoint_path = _download(_MODELS[checkpoint_name],full_path )
+            _, checkpoint_path = _download(_MODELS[checkpoint_name], full_path)
         except KeyError:
             print("The original checkpoint should be in _MODELS ")
 
     print(f"Loading model from : {full_path}/{checkpoint_name}")
-    original_checkpoint = torch.load(os.path.join(full_path,checkpoint_name)+".pt", map_location="cpu")
+    original_checkpoint = torch.load(os.path.join(full_path, checkpoint_name) + ".pt", map_location="cpu")
     dimensions = original_checkpoint["dims"]
     state_dict = original_checkpoint["model_state_dict"]
 
@@ -114,7 +120,7 @@ def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, ch
             f" but all the following weights are missing {missing}"
         )
 
-    model.save_pretrained(os.path.join(pytorch_dump_folder_path,checkpoint_name))
+    model.save_pretrained(os.path.join(pytorch_dump_folder_path, checkpoint_name))
 
 
 _MODELS = {
@@ -129,14 +135,6 @@ def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, ch
     "large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
 }
 
-import hashlib
-import io
-import os
-import urllib
-import warnings
-
-from tqdm import tqdm
-
 
 def _download(url: str, root: str) -> bytes:
     os.makedirs(root, exist_ok=True)
@@ -209,7 +207,9 @@ def convert_every_model(save_dir):
     parser = argparse.ArgumentParser()
     # # Required parameters
     parser.add_argument("--original_name", type=str, help="Path to the fairseq model (.pt) file.")
-    parser.add_argument("--pytorch_dump_folder_path", default="whisper-converted", type=str, help="Path to the output PyTorch model.")
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default="whisper-converted", type=str, help="Path to the output PyTorch model."
+    )
     args = parser.parse_args()
 
     convert_openai_whisper_to_tfms(args.original_name, args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 9ddd95fb27e91..2b17eea6508ec 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -158,8 +158,8 @@ def __call__(
         return_tensors: Optional[Union[str, TensorType]] = None,
         sampling_rate: Optional[int] = None,
         return_attention_mask: Optional[bool] = None,
-        padding: Optional[str]="max_length",
-        max_length: Optional[int]=None,
+        padding: Optional[str] = "max_length",
+        max_length: Optional[int] = None,
         **kwargs
     ) -> BatchFeature:
         """
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 6deb262d70c33..ca2436bdcea79 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -719,8 +719,8 @@ def forward(
         # expand attention_mask
         if attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            if attention_mask.shape[-1]>self.max_source_positions:
-                attention_mask = attention_mask[:,:self.max_source_positions]
+            if attention_mask.shape[-1] > self.max_source_positions:
+                attention_mask = attention_mask[:, : self.max_source_positions]
             attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
 
         encoder_states = () if output_hidden_states else None
@@ -1357,7 +1357,7 @@ def _prepare_attention_mask_for_generation(
 
         # Check if input is input_ids and padded -> only then is attention_mask defined
         if is_mel_spec and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
-            return inputs.ne(pad_token_id).long()[:,:,:self.max_source_positions,:self.max_source_positions]
+            return inputs.ne(pad_token_id).long()[:, :, : self.max_source_positions, : self.max_source_positions]
         else:
             return None
 
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 319d7f5c1becf..3f25a1a001c9f 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -16,7 +16,6 @@
 Speech processor class for Whisper
 """
 import warnings
-from contextlib import contextmanager
 
 from ...processing_utils import ProcessorMixin
 
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 6885e5aa0478b..b120bff20671a 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -249,7 +249,7 @@ def __init__(
         add_prefix_space=False,
         add_bos_token=False,
         **kwargs
-    ):        
+    ):
 
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
@@ -266,8 +266,8 @@ def __init__(
             **kwargs,
         )
         self.add_bos_token = add_bos_token
-        self.language=language
-        self.task=task
+        self.language = language
+        self.task = task
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
@@ -285,7 +285,6 @@ def __init__(
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-
     @property
     @lru_cache()
     def sot_sequence(self) -> Tuple[int]:
@@ -312,7 +311,6 @@ def _get_single_token_id(self, text) -> int:
         tokens = self.encode(text)
         return tokens[0]
 
-
     @property
     @lru_cache()
     def sot(self) -> int:
@@ -543,7 +541,7 @@ def _build_conversation_input_ids(self, conversation) -> List[int]:
             input_ids = input_ids[-self.model_max_length :]
         return input_ids
 
-    # TODO move to the logit processor 
+    # TODO move to the logit processor
     def _get_suppress_tokens(self, suppress_tokens=[]) -> Tuple[int]:
 
         if isinstance(suppress_tokens, str):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 3cb32d5312ff4..d0407f1fbdac3 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -108,8 +108,7 @@ def __init__(
         pad_token_id=0,
         num_mel_bins=80,
         decoder_start_token_id=(50258, 50259, 50359),
-        num_conv_layers=2
-
+        num_conv_layers=2,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -124,20 +123,18 @@ def __init__(
         self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.num_mel_bins=num_mel_bins
+        self.num_mel_bins = num_mel_bins
         self.max_position_embeddings = max_position_embeddings
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
         self.eos_token_id = eos_token_id
         self.pad_token_id = pad_token_id
         self.bos_token_id = bos_token_id
-        self.decoder_start_token_id=decoder_start_token_id
-        self.num_conv_layers=num_conv_layers
+        self.decoder_start_token_id = decoder_start_token_id
+        self.num_conv_layers = num_conv_layers
 
     def prepare_config_and_inputs(self):
-        input_features = floats_tensor(
-            [self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size
-        )
+        input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
         attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
         decoder_input_ids = torch.tensor(self.decoder_start_token_id)
 
@@ -784,7 +781,6 @@ def test_tiny_logits_librispeech(self):
 
     def test_large_logits_librispeech(self):
 
-
         torch_device = "cpu"
         set_seed(0)
         model = WhisperModel.from_pretrained("whisper/large")
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 3666bb564ad66..b9b88dccfc6ee 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -16,8 +16,7 @@
 
 from transformers import SPIECE_UNDERLINE
 from transformers.models.whisper import WhisperTokenizer
-from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -103,7 +102,10 @@ def test_tokenizer_integration(self):
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
     checkpoint_name = "/home/arthur_huggingface_co/transformers/whisper/tiny"
 
-    transcript = "'<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Nor is Mr. Quilters manner less interesting than his matter.<|endoftext|>'"
+    transcript = (
+        "'<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Nor is Mr. Quilters manner less interesting"
+        " than his matter.<|endoftext|>'"
+    )
     clean_transcript = "  Nor is Mr. Quilters manner less interesting than his matter."
 
     @classmethod
@@ -260,8 +262,6 @@ def test_tokenizer_special(self):
         # parameters of the original tokenizer : multilingual False, language=Japanese
         self.assertListEqual(jp_tokenizer.encode(text), EXPECTED_JAP)
 
-        supress_tokens = multilingual_tokenizer.non_speech_tokens
-
     def check_language_codes(self):
         self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)
         self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6)

From 1f9525549e6dfe72d556be395b2a2951219b02cd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:31:52 +0000
Subject: [PATCH 029/156] slow generation test

---
 tests/models/whisper/test_modeling_whisper.py |  5 ++-
 .../models/whisper/test_processor_whisper.py  | 33 +++----------------
 2 files changed, 8 insertions(+), 30 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index d0407f1fbdac3..58b615460ca99 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -716,7 +716,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 @require_torchaudio
 @require_sentencepiece
 @require_tokenizers
-# @slow
+@slow
 class WhisperModelIntegrationTests(unittest.TestCase):
     @cached_property
     def default_processor(self):
@@ -841,3 +841,6 @@ def test_generation(self):
         tokenizer = WhisperTokenizer.from_pretrained("whisper/tiny-multy")
         generated_ids = model.generate(input_features, num_beams=5)
         transcript = tokenizer.batch_decode(generated_ids)
+
+        EXPECTED_TRANSCRIPT = "Nor is Mr. Quilters manner less interesting than his matter."
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index aaff4a2b13192..e969c0f331474 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -13,15 +13,10 @@
 # limitations under the License.
 
 import shutil
-import tempfile
 import unittest
-from pathlib import Path
-from shutil import copyfile
 
 from transformers import WhisperTokenizer, is_speech_available
-from transformers.models.whisper.tokenization_whisper import VOCAB_FILES_NAMES
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_torch, require_torchaudio
-from transformers.utils import FEATURE_EXTRACTOR_NAME
+from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
 
 from .test_feature_extraction_whisper import floats_list
 
@@ -35,33 +30,13 @@
 @require_sentencepiece
 class WhisperProcessorTest(unittest.TestCase):
     def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
-
-        vocab = ["<s>", "<pad>", "</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est"]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        save_dir = Path(self.tmpdirname)
-        save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
-        if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
-            copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
-
-        tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname)
-
-        feature_extractor_map = {
-            "feature_size": 24,
-            "num_mel_bins": 24,
-            "padding_value": 0.0,
-            "sampling_rate": 16000,
-            "return_attention_mask": False,
-            "do_normalize": True,
-        }
-        save_json(feature_extractor_map, save_dir / FEATURE_EXTRACTOR_NAME)
+        self.checkpoint = "ArthurZ/whisper-small.eng"
 
     def get_tokenizer(self, **kwargs):
-        return WhisperTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+        return WhisperTokenizer.from_pretrained(self.checkpoint, **kwargs)
 
     def get_feature_extractor(self, **kwargs):
-        return WhisperFeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
+        return WhisperFeatureExtractor.from_pretrained(self.checkpoint, **kwargs)
 
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)

From 830528a9d1b2759e2627d81758c143b429a28967 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:38:06 +0000
Subject: [PATCH 030/156] ffn_dim to allow customization

---
 docs/source/en/index.mdx                      |  2 ++
 .../models/whisper/configuration_whisper.py   |  4 ++++
 .../models/whisper/modeling_whisper.py        | 21 ++++++++--------
 src/transformers/utils/dummy_pt_objects.py    | 24 +++++++++++++++++++
 .../dummy_sentencepiece_and_speech_objects.py |  7 ++++++
 .../utils/dummy_sentencepiece_objects.py      |  7 ++++++
 .../utils/dummy_speech_objects.py             |  7 ++++++
 7 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index e6a3d912b2743..d37ab1cbc311c 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -180,6 +180,7 @@ The documentation is organized into five sections:
 1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
 1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
 1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
@@ -323,6 +324,7 @@ Flax), PyTorch, and/or TensorFlow.
 |          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 |     Wav2Vec2-Conformer      |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            WavLM            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           Whisper           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           X-CLIP            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            XGLM             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 96af4691b5acd..fb303ddf5ed35 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -135,6 +135,8 @@ def __init__(
         encoder_attention_heads=4,
         decoder_layers=6,
         decoder_attention_heads=4,
+        decoder_ffn_dim=4,
+        encoder_ffn_dim=4,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
         decoder_start_token_id=(50258, 50259, 50359),
@@ -163,6 +165,8 @@ def __init__(
         self.encoder_attention_heads = encoder_attention_heads
         self.decoder_layers = decoder_layers
         self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.encoder_ffn_dim = encoder_ffn_dim
         self.dropout = dropout
         self.attention_dropout = attention_dropout
         self.activation_dropout = activation_dropout
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ca2436bdcea79..0ce15ef97d24f 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -139,7 +139,6 @@ def forward(self, input_ids, past_key_values_length=0):
         return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]]
 
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Whisper
 class WhisperAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -165,7 +164,7 @@ def __init__(
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
 
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)  # no bias in the k_proj in original code
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
         self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -301,8 +300,8 @@ def __init__(self, config: WhisperConfig):
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
         self.activation_dropout = config.activation_dropout
-        self.fc1 = nn.Linear(self.embed_dim, 4 * self.embed_dim)
-        self.fc2 = nn.Linear(4 * self.embed_dim, self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
     def forward(
@@ -336,11 +335,10 @@ def forward(
 
         residual = hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
 
         if hidden_states.dtype == torch.float16 and (
@@ -381,8 +379,8 @@ def __init__(self, config: WhisperConfig):
             is_decoder=True,
         )
         self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, 4 * self.embed_dim)
-        self.fc2 = nn.Linear(4 * self.embed_dim, self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
 
     def forward(
@@ -438,6 +436,7 @@ def forward(
         if encoder_hidden_states is not None:
             residual = hidden_states
             hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
             # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
             cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
             hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
@@ -500,7 +499,8 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
         """
         Computes the output length of the convolutional layers
         """
-        input_lengths = (input_lengths - 1) // 2 + 1
+        for i in range(self.config.num_conv_layers):
+            input_lengths = (input_lengths - 1) // 2 + 1
 
         return input_lengths
 
@@ -1180,7 +1180,6 @@ def forward(
     "The Whisper Model with a language modeling head. Can be used for summarization.",
     WHISPER_START_DOCSTRING,
 )
-# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextForConditionalGeneration with Speech2Text->Whisper,SPEECH_TO_TEXT->WHISPER,facebook/s2t-small-librispeech-asr->openai/whisper-base
 class WhisperForConditionalGeneration(WhisperPreTrainedModel):
     base_model_prefix = "model"
     _keys_to_ignore_on_load_missing = [
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index e9f1bae358f3a..6cbddad0f3612 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -5330,6 +5330,30 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+WHISPER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class WhisperForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class WhisperModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class WhisperPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
diff --git a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
index b9b971f1f15c7..9e3cdf54ab76b 100644
--- a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
@@ -8,3 +8,10 @@ class Speech2TextProcessor(metaclass=DummyObject):
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece", "speech"])
+
+
+class WhisperProcessor(metaclass=DummyObject):
+    _backends = ["sentencepiece", "speech"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["sentencepiece", "speech"])
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 69f0bdcb7b1aa..633cfd3e031d0 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -164,6 +164,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece"])
 
 
+class WhisperTokenizer(metaclass=DummyObject):
+    _backends = ["sentencepiece"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["sentencepiece"])
+
+
 class XGLMTokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 
diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py
index ae5589292a4cf..a2b4a40961b53 100644
--- a/src/transformers/utils/dummy_speech_objects.py
+++ b/src/transformers/utils/dummy_speech_objects.py
@@ -15,3 +15,10 @@ class Speech2TextFeatureExtractor(metaclass=DummyObject):
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["speech"])
+
+
+class WhisperFeatureExtractor(metaclass=DummyObject):
+    _backends = ["speech"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["speech"])

From 9fc86bca3c17fcbcebd3cc6a91637406ef34cd6b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:44:38 +0000
Subject: [PATCH 031/156] update readme

---
 README.md           | 1 +
 README_ko.md        | 1 +
 README_zh-hans.md   | 1 +
 README_zh-hant.md   | 1 +
 utils/check_repo.py | 2 ++
 5 files changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 5e17e33b204cc..fa3b28b5b45b1 100644
--- a/README.md
+++ b/README.md
@@ -390,6 +390,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/README_ko.md b/README_ko.md
index f53075ff5fe6f..372302b18a9af 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -340,6 +340,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 2843a8eb29a08..9f8199f41eb8a 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -364,6 +364,7 @@ conda install -c huggingface transformers
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 8f74b97e98549..cac8a736ce61e 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -376,6 +376,7 @@ conda install -c huggingface transformers
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 988967e797d12..d5f10f28ad748 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -49,6 +49,8 @@
     "DeformableDetrEncoder",  # Building part of bigger (tested) model.
     "DeformableDetrDecoder",  # Building part of bigger (tested) model.
     "OPTDecoder",  # Building part of bigger (tested) model.
+    "WhisperDecoder",  # Building part of bigger (tested) model.
+    "WhisperEncoder",  # Building part of bigger (tested) model.
     "DecisionTransformerGPT2Model",  # Building part of bigger (tested) model.
     "SegformerDecodeHead",  # Building part of bigger (tested) model.
     "PLBartEncoder",  # Building part of bigger (tested) model.

From 0497d0fa60466173501e9186ddf9efe318058cd7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sun, 25 Sep 2022 21:56:52 +0000
Subject: [PATCH 032/156] add to toctreee

---
 docs/source/en/_toctree.yml                              | 2 ++
 src/transformers/models/whisper/configuration_whisper.py | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 223c5d2a6998f..ea44c9730c957 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -472,6 +472,8 @@
         title: Speech Encoder Decoder Models
       - local: model_doc/trocr
         title: TrOCR
+      - local: model_doc/whisper
+        title: Whisper
       - local: model_doc/vilt
         title: ViLT
       - local: model_doc/vision-encoder-decoder
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index fb303ddf5ed35..e6892d2a4fa85 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -135,8 +135,8 @@ def __init__(
         encoder_attention_heads=4,
         decoder_layers=6,
         decoder_attention_heads=4,
-        decoder_ffn_dim=4,
-        encoder_ffn_dim=4,
+        decoder_ffn_dim=1536,
+        encoder_ffn_dim=1536,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
         decoder_start_token_id=(50258, 50259, 50359),

From 57cb28103132fd9835cc29b5178d380a65c01ad2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 05:47:48 +0000
Subject: [PATCH 033/156] start fixing integration tests

---
 .../models/whisper/configuration_whisper.py   |   4 +-
 .../whisper/convert_openai_whisper_to_tfms.py |  11 +-
 .../models/whisper/modeling_whisper.py        |  67 ++--------
 tests/models/whisper/test_modeling_whisper.py | 119 +++++++++++++++---
 4 files changed, 122 insertions(+), 79 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index e6892d2a4fa85..ba6102dea192a 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -128,7 +128,7 @@ class WhisperConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size=10000,
+        vocab_size=51865,
         feature_size=1,
         num_mel_bins=80,
         encoder_layers=6,
@@ -139,7 +139,7 @@ def __init__(
         encoder_ffn_dim=1536,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
-        decoder_start_token_id=(50258, 50259, 50359),
+        decoder_start_token_id=[50258, 50259, 50359],
         use_cache=True,
         is_encoder_decoder=True,
         activation_function="gelu",
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 9c70c759cd558..67edb86a92c4c 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -105,6 +105,8 @@ def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, ch
         decoder_layers=dimensions["n_text_layer"],
         decoder_attention_heads=dimensions["n_text_head"],
         max_source_positions=dimensions["n_audio_ctx"],
+        decoder_ffn_dim=4 * dimensions["n_audio_state"],
+        encoder_ffn_dim=4 * dimensions["n_audio_state"],
     )
 
     model = WhisperForConditionalGeneration(config)
@@ -174,23 +176,26 @@ def _download(url: str, root: str) -> bytes:
     return model_bytes, download_target
 
 
-def convert_every_model(save_dir):
+def convert_every_model(save_dir="whisper"):
     layers = [4, 6, 12, 24, 32]
     width = [384, 512, 768, 1024, 1280]
     heads = [6, 8, 12, 16, 20]
     name = ["tiny", "base", "small", "medium", "large"]
     for l, w, h, n in zip(layers, width, heads, name):
+        n += ".en"
         config = WhisperConfig(
-            vocab_size=51865,
+            vocab_size=51864,
             encoder_layers=l,
             encoder_attention_heads=h,
             decoder_attention_heads=h,
             decoder_layers=l,
             d_model=w,
+            decoder_ffn_dim=4*w,
+            encoder_ffn_dim=4*w,
         )
         model = WhisperModel(config)
 
-        model_bytes = _download(_MODELS[n], "weights")
+        model_bytes, _ = _download(_MODELS[n], "original-weights")
         with io.BytesIO(model_bytes) as fp:
             original = torch.load(fp, map_location="cpu")["model_state_dict"]
 
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 0ce15ef97d24f..ebfa78c8f8269 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -95,41 +95,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-class Conv1dSubsampler(nn.Module):
-    """
-    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
-    via gated linear units (https://arxiv.org/abs/1911.08460)
-    """
-
-    def __init__(self, config):
-        super(Conv1dSubsampler, self).__init__()
-        self.config = config
-        self.num_layers = config.num_conv_layers
-        self.in_channels = config.input_feat_per_channel * config.input_channels
-        self.mid_channels = config.conv_channels
-        self.out_channels = config.d_model
-        self.kernel_sizes = config.conv_kernel_sizes
-
-        self.conv_layers = nn.ModuleList(
-            nn.Conv1d(
-                self.in_channels if i == 0 else self.mid_channels // 2,
-                self.mid_channels if i < self.num_layers - 1 else self.out_channels * 2,
-                kernel_size=k,
-                stride=2,
-                padding=k // 2,
-            )
-            for i, k in enumerate(self.kernel_sizes)
-        )
-
-    def forward(self, input_features):
-        hidden_states = input_features.transpose(1, 2).contiguous()  # -> B x (C x D) x T
-        for conv in self.conv_layers:
-            hidden_states = conv(hidden_states)
-            hidden_states = nn.functional.glu(hidden_states, dim=1)
-        hidden_states = hidden_states.transpose(1, 2).contiguous()  # -> T x B x (C x D)
-        return hidden_states
-
-
 class WhisperPositionalEmbedding(nn.Embedding):
     def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
         super().__init__(num_positions, embedding_dim)
@@ -494,23 +459,14 @@ def _init_weights(self, module):
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
-
-    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
-        """
-        Computes the output length of the convolutional layers
-        """
-        for i in range(self.config.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
-
-        return input_lengths
-
+    
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
 
-        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+        subsampled_lengths = (attention_mask.sum(-1)-1)//2 +1
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
@@ -716,12 +672,15 @@ def forward(
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
-        # expand attention_mask
+
+        # subsample attention mask if necessary
         if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            if attention_mask.shape[-1] > self.max_source_positions:
-                attention_mask = attention_mask[:, : self.max_source_positions]
+            attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
+            attention_mask = attention_mask.ne(1).long()
             attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
+        # else:
+        #     attention_mask = torch.ones([], dtype=torch.long, device=inputs_embeds.device)
+  
 
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -1139,9 +1098,7 @@ def forward(
 
         # downsample encoder attention mask
         if attention_mask is not None:
-            encoder_attention_mask = self._get_feature_vector_attention_mask(
-                encoder_outputs[0].shape[1], attention_mask
-            )
+            encoder_attention_mask =  attention_mask[  : , encoder_outputs[0].shape[1]]
         else:
             encoder_attention_mask = None
 
@@ -1372,10 +1329,10 @@ def _prepare_decoder_input_ids_for_generation(
         if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
             return model_kwargs.pop("decoder_input_ids")
         else:
-            decoder_start_token_id = self.config.decoder_start_token_id
+            decoder_start_token_id = list(self.config.decoder_start_token_id)
             if device is None:
                 device = self.device
-            return torch.ones((batch_size, 1), dtype=torch.long, device=device) * decoder_start_token_id
+            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device) 
 
     @staticmethod
     def _reorder_cache(past, beam_idx):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 58b615460ca99..b53b87367929c 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -89,7 +89,7 @@ def __init__(
         self,
         parent,
         batch_size=13,
-        seq_length=30,
+        seq_length=60,
         is_training=True,
         use_labels=False,
         vocab_size=99,
@@ -170,6 +170,8 @@ def prepare_config_and_inputs_for_common(self):
         config, inputs_dict = self.prepare_config_and_inputs()
         return config, inputs_dict
 
+
+
     def get_subsampled_output_lengths(self, input_lengths):
         """
         Computes the output length of the convolutional layers
@@ -302,6 +304,25 @@ def test_encoder_decoder_model_standalone(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
         self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
 
+    def _get_input_ids_and_config(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict[self.input_name]
+
+        # cut to half length & take max batch_size 3
+        max_batch_size = 2
+        sequence_length = input_ids.shape[-1] // 2
+        input_ids = input_ids[:max_batch_size, : , :sequence_length]
+
+        # generate max 3 tokens
+        max_length = input_ids.shape[-1] + 3
+        if config.eos_token_id is not None and config.pad_token_id is None:
+            # hack to allow generate for models such as GPT2 as is done in `generate()`
+            config.pad_token_id = config.eos_token_id
+
+        attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:max_batch_size, :sequence_length]
+
+        return config, input_ids, attention_mask, max_length
+
     # not implemented currently
     def test_inputs_embeds(self):
         pass
@@ -716,7 +737,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 @require_torchaudio
 @require_sentencepiece
 @require_tokenizers
-@slow
+# @slow
 class WhisperModelIntegrationTests(unittest.TestCase):
     @cached_property
     def default_processor(self):
@@ -731,35 +752,76 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
-    def test_tiny_logits_librispeech(self):
+    def test_inference_no_head(self):
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperModel.from_pretrained("whisper/tiny")
+        model.to(torch_device)
+        input_speech = self._load_datasamples(1)[-1]
+        feature_extractor = WhisperFeatureExtractor()
+        input_features = feature_extractor(input_speech).input_features
+        with torch.no_grad():
+            logits = model(
+                input_features,
+                decoder_input_ids=torch.tensor([[50258, 50259, 50359]]),
+                output_hidden_states=False,
+                output_attentions=False,
+                return_dict=False,
+                use_cache=False,
+            )
+
+        # fmt: off
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                3.0952, -7.5706,  4.4620,  4.0414,  1.1512, -5.6110,  4.9799, -2.3589,
+                0.2374,  1.3166,  2.4747,  5.3375,  1.8736, -7.0334,  4.5650,  7.2630,
+                5.3717, -1.4470,  8.2825,  0.0850,  0.2813, -9.2511, -1.0556, -7.4180,
+                1.3470,  3.8482,  6.9497, -5.4600, -2.3341,  8.6016
+         ]
+        )
+        # fmt: on
+        self.assertTrue(torch.allclose(logits[0][0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
+
+        # fmt: off
+        EXPECTED_GENERATION = torch.tensor(
+            [
+                -2.7021, -4.1197,  1.4249,  1.0848,  2.6796, -1.2697, -4.7105,  0.6435,
+                -1.2619, -0.2921, -0.1743, -1.0284, -4.8171, -1.8063, -0.8316,  3.6740,
+                -0.2714,  0.2152, -0.2616, -1.6536,  0.5402,  0.0383,  0.0252,  1.2742,
+                0.5945, -6.7489, -1.9650,  3.0372,  1.9253,  1.6359
+            ]
+        )
+        # fmt: on
+        
+        head_logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
+        self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
+
 
-        from transformers import GPT2Tokenizer, WhisperFeatureExtractor, set_seed
+    def test_small_logits_librispeech(self):
 
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperModel.from_pretrained("whisper/tiny")
+        model = WhisperModel.from_pretrained("/home/arthur_huggingface_co/transformers/whisper-final/small.en")
         model.to(torch_device)
 
         # processor = self.default_processor
 
-        input_speech = self._load_datasamples(1)
+        input_speech = self._load_datasamples(1)[-1]
 
         feaure_extractor = WhisperFeatureExtractor()
-        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
         tokenizer.pad_token = 0
 
         processor = WhisperProcessor(feaure_extractor, tokenizer)
 
         input_features = processor(
-            audio=input_speech, text="This part of the speech", return_tensors="pt"
+            audio=input_speech, return_tensors="pt"
         ).input_features.to(torch_device)
-        labels = processor(audio=input_speech, text="This part of the speech", return_tensors="pt").labels.to(
-            torch_device
-        )
+
         with torch.no_grad():
             logits = model(
                 input_features,
-                decoder_input_ids=labels,
+                decoder_input_ids=torch.tensor([model.config.decoder_start_token_id]),
                 output_hidden_states=False,
                 output_attentions=False,
                 use_cache=False,
@@ -770,10 +832,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                8.8958, 4.0423, 9.8841, 9.8493, 10.0628, 4.8472, 9.0100, 5.7364,
-                5.9165, 7.6322, 3.1579, 10.7269, 6.9586, 10.1852, 5.4714, 8.2995,
-                4.7507, 6.6723, 7.2764, 7.1831, 7.0388, 7.2191, 6.2364, 6.2117,
-                5.8797, 2.8099, 6.8319, 5.7094, 0.6999, 6.8444
+                -3.6748, -5.8862, -6.7038, -8.0681, -5.9222, -7.5940, -4.6479, -6.2475,
+                -3.6708, -4.0578, -6.5905, -4.0916, -6.9554, -3.9227, -5.4782, -5.1931,
+                -5.8127, -6.6239, -6.4373, -6.8171, -7.0080, -7.8014, -6.8085, -8.3919,
+                -6.5980, -5.6730, -4.6434, -7.4606, -8.5103, -5.4635
             ]
         )
         # fmt: on
@@ -830,7 +892,7 @@ def test_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperForConditionalGeneration.from_pretrained("whisper/tiny")
+        model = WhisperForConditionalGeneration.from_pretrained("whisper-final/tiny")
         model.to(torch_device)
 
         input_speech = self._load_datasamples(1)
@@ -838,9 +900,28 @@ def test_generation(self):
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("whisper/tiny-multy")
+        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
         generated_ids = model.generate(input_features, num_beams=5)
         transcript = tokenizer.batch_decode(generated_ids)
 
-        EXPECTED_TRANSCRIPT = "Nor is Mr. Quilters manner less interesting than his matter."
+        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+    def test_large_generation(self):
+
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperForConditionalGeneration.from_pretrained("whisper-final/large")
+        model.to(torch_device)
+
+        input_speech = self._load_datasamples(1)
+        feaure_extractor = WhisperFeatureExtractor()
+
+        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
+
+        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
+        generated_ids = model.generate(input_features, do_sample = False)
+        transcript = tokenizer.batch_decode(generated_ids)
+
+        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From a27eb00d3a852fb55c17dc81f2d7c666ad67dc39 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 12:00:34 +0000
Subject: [PATCH 034/156] update tests and code

---
 docs/source/en/_toctree.yml                   | 10 ++--
 .../whisper/convert_openai_whisper_to_tfms.py | 51 ++++++++++++++++++-
 .../whisper/feature_extraction_whisper.py     |  2 +
 .../models/whisper/modeling_whisper.py        | 10 +++-
 tests/models/whisper/test_modeling_whisper.py | 35 ++++++-------
 .../whisper/test_tokenization_whisper.py      |  8 +--
 6 files changed, 86 insertions(+), 30 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index ea44c9730c957..2f85a5a8beee5 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -42,7 +42,8 @@
       title: Use tokenizers from 🤗 Tokenizers
     - local: multilingual
       title: Inference for multilingual models
-    - sections:
+    - isExpanded: false
+      sections:
       - local: tasks/sequence_classification
         title: Text classification
       - local: tasks/token_classification
@@ -58,7 +59,6 @@
       - local: tasks/multiple_choice
         title: Multiple choice
       title: Task guides
-      isExpanded: false
     title: Natural Language Processing
   - sections:
     - local: tasks/audio_classification
@@ -472,8 +472,6 @@
         title: Speech Encoder Decoder Models
       - local: model_doc/trocr
         title: TrOCR
-      - local: model_doc/whisper
-        title: Whisper
       - local: model_doc/vilt
         title: ViLT
       - local: model_doc/vision-encoder-decoder
@@ -482,6 +480,8 @@
         title: Vision Text Dual Encoder
       - local: model_doc/visual_bert
         title: VisualBERT
+      - local: model_doc/whisper
+        title: Whisper
       - local: model_doc/xclip
         title: X-CLIP
       title: Multimodal models
@@ -507,4 +507,4 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
\ No newline at end of file
+  title: API
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 67edb86a92c4c..8d3e38810232e 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -23,7 +23,7 @@
 from torch import nn
 from tqdm import tqdm
 
-from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperModel
+from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperModel, WhisperFeatureExtractor, WhisperProcessor
 
 
 def remove_ignore_keys_(state_dict):
@@ -181,6 +181,44 @@ def convert_every_model(save_dir="whisper"):
     width = [384, 512, 768, 1024, 1280]
     heads = [6, 8, 12, 16, 20]
     name = ["tiny", "base", "small", "medium", "large"]
+    for l, w, h, n in zip(layers, width, heads, name):
+        
+        config = WhisperConfig(
+            vocab_size=51865,
+            encoder_layers=l,
+            encoder_attention_heads=h,
+            decoder_attention_heads=h,
+            decoder_layers=l,
+            d_model=w,
+            decoder_ffn_dim=4*w,
+            encoder_ffn_dim=4*w,
+        )
+        model = WhisperModel(config)
+
+        model_bytes, _ = _download(_MODELS[n], "original-weights")
+        with io.BytesIO(model_bytes) as fp:
+            original = torch.load(fp, map_location="cpu")["model_state_dict"]
+
+        # original = torch.load(f"/home/arthur_huggingface_co/whisper/tiny.pt")
+        new = rename_keys(original.copy())
+
+        missing, unexpected = model.load_state_dict(new, strict=False)
+        if missing == []:
+            print("succesfully loaded")
+            model.save_pretrained(f"{save_dir}/{n}")
+        
+        checkpoint_path = f"openai/whisper-{n}"
+        model.push_to_hub(checkpoint_path, use_auth_token = "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")
+        from transformers import WhisperTokenizer
+        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
+        # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
+
+        feature_extractor = WhisperFeatureExtractor()
+        processor = WhisperProcessor(feature_extractor, tokenizer)
+        processor.push_to_hub(checkpoint_path, use_auth_token = "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")
+
+    # for en only, decoder input_ids = 50257
+
     for l, w, h, n in zip(layers, width, heads, name):
         n += ".en"
         config = WhisperConfig(
@@ -206,7 +244,16 @@ def convert_every_model(save_dir="whisper"):
         if missing == []:
             print("succesfully loaded")
             model.save_pretrained(f"{save_dir}/{n}")
-
+        
+        checkpoint_path = f"openai/whisper-{n}"
+        model.push_to_hub(checkpoint_path, use_auth_token = "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")
+        from transformers import WhisperTokenizer
+        tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper-any.en")
+        # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
+
+        feature_extractor = WhisperFeatureExtractor()
+        processor = WhisperProcessor(feature_extractor, tokenizer)
+        processor.push_to_hub(checkpoint_path, use_auth_token = "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")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 2b17eea6508ec..495b81a416ac8 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -231,6 +231,8 @@ def __call__(
         if not is_batched:
             raw_speech = [raw_speech]
 
+
+        # TODO switch order, should pad first, mel after
         # extract fbank features
         features = [self._extract_fbank_features(waveform).permute(1, 0) for waveform in raw_speech]
 
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ebfa78c8f8269..f74e34d0f671f 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -81,7 +81,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_
 
 
 # Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None ):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
     """
@@ -751,6 +751,7 @@ def __init__(self, config: WhisperConfig):
         self.layerdrop = config.decoder_layerdrop
         self.padding_idx = config.pad_token_id
         self.max_target_positions = config.max_target_positions
+        self.max_source_positions = config.max_source_positions
         self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
@@ -1030,6 +1031,9 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
+    def _get_feat_extract_output_lengths(self, input:int):
+        return (input-1//2) +1
+
     @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
@@ -1098,7 +1102,9 @@ def forward(
 
         # downsample encoder attention mask
         if attention_mask is not None:
-            encoder_attention_mask =  attention_mask[  : , encoder_outputs[0].shape[1]]
+            encoder_attention_mask = self._get_feature_vector_attention_mask(
+                encoder_outputs[0].shape[1], attention_mask
+            )
         else:
             encoder_attention_mask = None
 
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index b53b87367929c..1530be137088b 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -101,13 +101,13 @@ def __init__(
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,
         max_position_embeddings=20,
-        max_source_positions=15,
+        max_source_positions=30,
         max_target_positions=4,
-        bos_token_id=50257,
-        eos_token_id=50257,
+        bos_token_id=99,
+        eos_token_id=99,
         pad_token_id=0,
         num_mel_bins=80,
-        decoder_start_token_id=(50258, 50259, 50359),
+        decoder_start_token_id=[96,98],
         num_conv_layers=2,
     ):
         self.parent = parent
@@ -135,8 +135,8 @@ def __init__(
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
-        attention_mask = torch.ones([self.batch_size, self.seq_length], dtype=torch.long, device=torch_device)
-        decoder_input_ids = torch.tensor(self.decoder_start_token_id)
+        attention_mask = torch.ones([self.batch_size, self.max_source_positions], dtype=torch.long, device=torch_device)
+        decoder_input_ids = torch.tensor(self.batch_size * [self.decoder_start_token_id], device=torch_device)
 
         config = self.get_config()
         inputs_dict = prepare_whisper_inputs_dict(
@@ -164,6 +164,7 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.decoder_start_token_id
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -630,7 +631,7 @@ def _get_encoder_outputs(
             num_interleave, dim=0
         )
         input_ids = input_ids[:, :, 0]
-        input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + model._get_decoder_start_token_id()
+        input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + torch.tensor([model._get_decoder_start_token_id()], device=input_ids.device)
         attention_mask = None
         return encoder_outputs, input_ids, attention_mask
 
@@ -755,7 +756,7 @@ def _load_datasamples(self, num_samples):
     def test_inference_no_head(self):
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperModel.from_pretrained("whisper/tiny")
+        model = WhisperModel.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
         input_speech = self._load_datasamples(1)[-1]
         feature_extractor = WhisperFeatureExtractor()
@@ -793,7 +794,7 @@ def test_inference_no_head(self):
         )
         # fmt: on
         
-        head_logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
+        head_logits = logits[0] @ model.decoder.embed_tokens.weight.T
         self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
 
 
@@ -801,7 +802,7 @@ def test_small_logits_librispeech(self):
 
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperModel.from_pretrained("/home/arthur_huggingface_co/transformers/whisper-final/small.en")
+        model = WhisperModel.from_pretrained("openai/whisper-small.en")
         model.to(torch_device)
 
         # processor = self.default_processor
@@ -844,8 +845,8 @@ def test_small_logits_librispeech(self):
     def test_large_logits_librispeech(self):
 
         torch_device = "cpu"
+        model = WhisperModel.from_pretrained("openai/whisper-large")
         set_seed(0)
-        model = WhisperModel.from_pretrained("whisper/large")
         model.to(torch_device)
 
         # processor = self.default_processor
@@ -853,7 +854,7 @@ def test_large_logits_librispeech(self):
         input_speech = self._load_datasamples(1)
 
         feaure_extractor = WhisperFeatureExtractor()
-        tokenizer = WhisperTokenizer.from_pretrained("gpt2")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
         tokenizer.pad_token = 0
 
         processor = WhisperProcessor(feaure_extractor, tokenizer)
@@ -892,7 +893,7 @@ def test_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperForConditionalGeneration.from_pretrained("whisper-final/tiny")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
 
         input_speech = self._load_datasamples(1)
@@ -900,9 +901,9 @@ def test_generation(self):
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
         generated_ids = model.generate(input_features, num_beams=5)
-        transcript = tokenizer.batch_decode(generated_ids)
+        transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -911,7 +912,7 @@ def test_large_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
-        model = WhisperForConditionalGeneration.from_pretrained("whisper-final/large")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
 
         input_speech = self._load_datasamples(1)
@@ -919,7 +920,7 @@ def test_large_generation(self):
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
         generated_ids = model.generate(input_features, do_sample = False)
         transcript = tokenizer.batch_decode(generated_ids)
 
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index b9b88dccfc6ee..095b974a0c945 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -100,7 +100,7 @@ def test_tokenizer_integration(self):
 
 @require_sentencepiece
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
-    checkpoint_name = "/home/arthur_huggingface_co/transformers/whisper/tiny"
+    checkpoint_name = "ArthurZ/whisper-small.eng"
 
     transcript = (
         "'<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Nor is Mr. Quilters manner less interesting"
@@ -116,9 +116,9 @@ def setUpClass(cls):
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
         multilingual_tokenizer = WhisperTokenizer.from_pretrained(
-            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy"
+            "ArthurZ/whisper-small"
         )
-        gpt2_tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny")
+        gpt2_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.eng")
 
         text = "다람쥐 헌 쳇바퀴에 타고파"
         gpt2_tokens = gpt2_tokenizer.encode(text)
@@ -189,7 +189,7 @@ def test_tokenizer_equivalence(self):
 
     def test_tokenizer_special(self):
         multilingual_tokenizer = WhisperTokenizer.from_pretrained(
-            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy"
+            "ArthurZ/whisper-small.eng"
         )
         text = "[Denis] Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
         multilingual_tokens = multilingual_tokenizer.encode(text)

From ef6e08e1216fe77f2b0e69b3cff49bb5b007f089 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 16:25:42 +0000
Subject: [PATCH 035/156] fix feature extractor

---
 .../whisper/feature_extraction_whisper.py     | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 495b81a416ac8..3cb0897e9fcfc 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -229,30 +229,31 @@ def __call__(
 
         # always return batch
         if not is_batched:
-            raw_speech = [raw_speech]
+            # raw_speech = [raw_speech]
+            pass
 
-
-        # TODO switch order, should pad first, mel after
-        # extract fbank features
-        features = [self._extract_fbank_features(waveform).permute(1, 0) for waveform in raw_speech]
+        batched_speech = BatchFeature({"input_features": [np.asarray(raw_speech).T]})
 
         # convert into correct format for padding
-        encoded_inputs = BatchFeature({"input_features": features})
+        
 
         padded_inputs = self.pad(
-            encoded_inputs,
+            batched_speech,
             padding=padding,
-            max_length=max_length if max_length else self.nb_max_frame,
+            max_length=max_length if max_length else self.n_samples+1,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
             return_attention_mask=return_attention_mask,
             **kwargs,
         )
-        padded_inputs["input_features"] = padded_inputs["input_features"].permute(0, 2, 1)
         # make sure list is in array format
-        input_features = padded_inputs.get("input_features")
+        input_features = padded_inputs.get("input_features").transpose(0, 2,1)
+        input_features = [self._extract_fbank_features(waveform[0]) for waveform in input_features]
+        
         if isinstance(input_features[0], list):
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
+        else : 
+            padded_inputs["input_features"] = input_features
 
         attention_mask = padded_inputs.get("attention_mask")
         if attention_mask is not None:

From fc5ce23bb52763f50d70e80c37eae43ea4442f4c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 16:26:37 +0000
Subject: [PATCH 036/156] fix config tests common

---
 src/transformers/models/whisper/configuration_whisper.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index ba6102dea192a..c73b54bbc4fcc 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -189,5 +189,6 @@ def __init__(
             eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
             decoder_start_token_id=decoder_start_token_id,
+            tie_word_embeddings = tie_word_embeddings,
             **kwargs,
         )

From 655e460b04f953dd72eb88667244a3aed30ced14 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 16:27:13 +0000
Subject: [PATCH 037/156] update code to fix tests

---
 .../models/whisper/modeling_whisper.py        | 17 +++---
 tests/models/whisper/test_modeling_whisper.py | 56 ++++++++++++++-----
 2 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index f74e34d0f671f..c988ed0ad9579 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -460,6 +460,9 @@ def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
     
+    def _get_feat_extract_output_lengths(self, input:int):
+        return (input-1)//2 +1
+
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
@@ -775,12 +778,15 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
         combined_attention_mask = None
-        if input_shape[-1] > 1:
+    
+        if input_shape[-1] > 1  :
             combined_attention_mask = _make_causal_mask(
                 input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
             ).to(inputs_embeds.device)
 
         if attention_mask is not None:
+            if attention_mask.shape[-1] > input_shape[-1] > 1  :
+                attention_mask = attention_mask[:,:input_shape[-1]]
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
             combined_attention_mask = (
@@ -893,6 +899,7 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
@@ -1031,8 +1038,7 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
-    def _get_feat_extract_output_lengths(self, input:int):
-        return (input-1//2) +1
+
 
     @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@@ -1148,13 +1154,10 @@ class WhisperForConditionalGeneration(WhisperPreTrainedModel):
     _keys_to_ignore_on_load_missing = [
         r"encoder.version",
         r"decoder.version",
-        r"model.encoder.embed_positions.weight",
-        r"model.decoder.embed_positions.weight",
         r"proj_out.weight",
     ]
     _keys_to_ignore_on_save = [
-        r"model.encoder.embed_positions.weight",
-        r"model.decoder.embed_positions.weight",
+        r"proj_out.weight",
     ]
 
     def __init__(self, config: WhisperConfig):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 1530be137088b..e31e56e12212f 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -31,6 +31,7 @@
     torch_device,
 )
 from transformers.utils import cached_property
+from transformers.generation_logits_process import SuppressBlank, SuppressTokens, LogitsProcessorList
 
 from ...generation.test_generation_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -102,13 +103,13 @@ def __init__(
         attention_probs_dropout_prob=0.1,
         max_position_embeddings=20,
         max_source_positions=30,
-        max_target_positions=4,
-        bos_token_id=99,
-        eos_token_id=99,
+        max_target_positions=40,
+        bos_token_id=98,
+        eos_token_id=98,
         pad_token_id=0,
         num_mel_bins=80,
-        decoder_start_token_id=[96,98],
-        num_conv_layers=2,
+        decoder_start_token_id=[85,87],
+        num_conv_layers=1,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -164,6 +165,8 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
+            decoder_ffn_dim = self.hidden_size,
+            encoder_ffn_dim = self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id
         )
 
@@ -310,9 +313,9 @@ def _get_input_ids_and_config(self):
         input_ids = inputs_dict[self.input_name]
 
         # cut to half length & take max batch_size 3
-        max_batch_size = 2
+        max_batch_size = 3
         sequence_length = input_ids.shape[-1] // 2
-        input_ids = input_ids[:max_batch_size, : , :sequence_length]
+        input_ids = input_ids[:max_batch_size, : , :]
 
         # generate max 3 tokens
         max_length = input_ids.shape[-1] + 3
@@ -337,6 +340,7 @@ def test_training_gradient_checkpointing(self):
 
     def test_generate_fp16(self):
         config, input_dict = self.model_tester.prepare_config_and_inputs()
+        config.max_target_positions = 400
         input_features = input_dict["input_features"]
         attention_mask = input_dict["attention_mask"]
         model = WhisperForConditionalGeneration(config).eval().to(torch_device)
@@ -402,7 +406,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                 self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 2)
 
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
@@ -426,9 +430,9 @@ def test_attention_outputs(self):
         config.return_dict = True
 
         seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 2)
         encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", 2)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
         for model_class in self.all_model_classes:
@@ -636,7 +640,7 @@ def _get_encoder_outputs(
         return encoder_outputs, input_ids, attention_mask
 
     def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
-        batch_size, seq_length = input_ids.shape[:2]
+        batch_size, mel, seq_length = input_ids.shape
         subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(seq_length)
         num_sequences_in_output = batch_size * num_return_sequences
         gen_len = (
@@ -889,12 +893,32 @@ def test_large_logits_librispeech(self):
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
+    def test_generation_en_only(self):
+
+        torch_device = "cpu"
+        set_seed(0)
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model.to(torch_device)
+
+        input_speech = self._load_datasamples(1)
+        feaure_extractor = WhisperFeatureExtractor()
+
+        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
+
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
+        generated_ids = model.generate(input_features, num_beams=5)
+        transcript = tokenizer.batch_decode(generated_ids)[0]
+
+        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
     def test_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
+        model.config.decoder_input_ids = [50258, 50259, 50359, 50363]
 
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
@@ -902,7 +926,7 @@ def test_generation(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        generated_ids = model.generate(input_features, num_beams=5)
+        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids = torch.tensor([ [50258, 50259, 50359, 50363]]))
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
@@ -920,8 +944,12 @@ def test_large_generation(self):
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
-        generated_ids = model.generate(input_features, do_sample = False)
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small.en")
+
+        logits_processor = LogitsProcessorList([SuppressBlank(tokenizer.encode(" "),tokenizer.eos_token_id ), SuppressTokens(tokenizer._get_suppress_tokens("-1"))]    )
+
+
+        generated_ids = model.generate(input_features, do_sample = False, logits_processor=logits_processor, decoder_start_token_id=torch.tensor([ [50257, 50362]] ), attention_mask=None, decoder_attention_mask =None  )
         transcript = tokenizer.batch_decode(generated_ids)
 
         EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"

From d7dcfbd23987fa65533f0a932adb0624d6872a57 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 17:05:49 +0000
Subject: [PATCH 038/156] fix feature exctractor

---
 .../models/whisper/feature_extraction_whisper.py  | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 3cb0897e9fcfc..8369ed2ae4008 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -235,29 +235,28 @@ def __call__(
         batched_speech = BatchFeature({"input_features": [np.asarray(raw_speech).T]})
 
         # convert into correct format for padding
-        
 
         padded_inputs = self.pad(
             batched_speech,
             padding=padding,
-            max_length=max_length if max_length else self.n_samples+1,
+            max_length=max_length if max_length else self.n_samples,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
             return_attention_mask=return_attention_mask,
             **kwargs,
         )
         # make sure list is in array format
-        input_features = padded_inputs.get("input_features").transpose(0, 2,1)
+        input_features = padded_inputs.get("input_features").transpose(0, 2, 1)
         input_features = [self._extract_fbank_features(waveform[0]) for waveform in input_features]
-        
-        if isinstance(input_features[0], list):
+
+        if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0],List) :
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
-        else : 
+        else:
             padded_inputs["input_features"] = input_features
 
-        attention_mask = padded_inputs.get("attention_mask")
+        attention_mask = np.asarray(padded_inputs.get("attention_mask"))[:,:self.nb_max_frame]
         if attention_mask is not None:
-            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
+            padded_inputs["attention_mask"] = [attention_mask]
 
         if return_tensors is not None:
             padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

From c64e8a6bdd78a81fa21a364abf434dd6bc1b9def Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 17:30:13 +0000
Subject: [PATCH 039/156] nit feature extraction

---
 .../models/whisper/feature_extraction_whisper.py         | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 8369ed2ae4008..01c49de281b29 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -77,7 +77,7 @@ def __init__(
         self.return_attention_mask = True
         self.n_samples = chunk_length * sampling_rate
         self.nb_max_frame = self.n_samples // hop_length
-
+        self.sampling_rate = sampling_rate
         self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=num_mel_bins)
 
     def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
@@ -229,9 +229,8 @@ def __call__(
 
         # always return batch
         if not is_batched:
-            # raw_speech = [raw_speech]
-            pass
-
+            raw_speech = [raw_speech]
+            
         batched_speech = BatchFeature({"input_features": [np.asarray(raw_speech).T]})
 
         # convert into correct format for padding
@@ -247,7 +246,7 @@ def __call__(
         )
         # make sure list is in array format
         input_features = padded_inputs.get("input_features").transpose(0, 2, 1)
-        input_features = [self._extract_fbank_features(waveform[0]) for waveform in input_features]
+        input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
 
         if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0],List) :
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]

From edce53e706f96b2b6d0a939ea7bf72c93ea99d7d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 17:59:16 +0000
Subject: [PATCH 040/156] update test for new feature extractor

---
 tests/models/whisper/test_modeling_whisper.py | 92 ++++++++++++-------
 .../whisper/test_tokenization_whisper.py      |  8 +-
 2 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index e31e56e12212f..bde1f966d23ca 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -21,6 +21,7 @@
 import unittest
 
 from transformers import WhisperConfig
+from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
 from transformers.testing_utils import (
     is_torch_available,
     require_sentencepiece,
@@ -31,7 +32,6 @@
     torch_device,
 )
 from transformers.utils import cached_property
-from transformers.generation_logits_process import SuppressBlank, SuppressTokens, LogitsProcessorList
 
 from ...generation.test_generation_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -108,7 +108,7 @@ def __init__(
         eos_token_id=98,
         pad_token_id=0,
         num_mel_bins=80,
-        decoder_start_token_id=[85,87],
+        decoder_start_token_id=[85, 87],
         num_conv_layers=1,
     ):
         self.parent = parent
@@ -136,7 +136,9 @@ def __init__(
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
-        attention_mask = torch.ones([self.batch_size, self.max_source_positions], dtype=torch.long, device=torch_device)
+        attention_mask = torch.ones(
+            [self.batch_size, self.max_source_positions], dtype=torch.long, device=torch_device
+        )
         decoder_input_ids = torch.tensor(self.batch_size * [self.decoder_start_token_id], device=torch_device)
 
         config = self.get_config()
@@ -165,17 +167,15 @@ def get_config(self):
             eos_token_id=self.eos_token_id,
             bos_token_id=self.bos_token_id,
             pad_token_id=self.pad_token_id,
-            decoder_ffn_dim = self.hidden_size,
-            encoder_ffn_dim = self.hidden_size,
-            decoder_start_token_id=self.decoder_start_token_id
+            decoder_ffn_dim=self.hidden_size,
+            encoder_ffn_dim=self.hidden_size,
+            decoder_start_token_id=self.decoder_start_token_id,
         )
 
     def prepare_config_and_inputs_for_common(self):
         config, inputs_dict = self.prepare_config_and_inputs()
         return config, inputs_dict
 
-
-
     def get_subsampled_output_lengths(self, input_lengths):
         """
         Computes the output length of the convolutional layers
@@ -315,7 +315,7 @@ def _get_input_ids_and_config(self):
         # cut to half length & take max batch_size 3
         max_batch_size = 3
         sequence_length = input_ids.shape[-1] // 2
-        input_ids = input_ids[:max_batch_size, : , :]
+        input_ids = input_ids[:max_batch_size, :, :]
 
         # generate max 3 tokens
         max_length = input_ids.shape[-1] + 3
@@ -405,7 +405,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
 
                 self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
-                seq_len = getattr(self.model_tester, "seq_length", None)
+
                 decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 2)
 
                 self.assertListEqual(
@@ -635,7 +635,9 @@ def _get_encoder_outputs(
             num_interleave, dim=0
         )
         input_ids = input_ids[:, :, 0]
-        input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + torch.tensor([model._get_decoder_start_token_id()], device=input_ids.device)
+        input_ids = torch.zeros_like(input_ids[:, :1], dtype=torch.long) + torch.tensor(
+            [model._get_decoder_start_token_id()], device=input_ids.device
+        )
         attention_mask = None
         return encoder_outputs, input_ids, attention_mask
 
@@ -762,9 +764,10 @@ def test_inference_no_head(self):
         set_seed(0)
         model = WhisperModel.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
-        input_speech = self._load_datasamples(1)[-1]
+        input_speech = self._load_datasamples(1)
         feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(input_speech).input_features
+        input_features = feature_extractor(2*input_speech, return_tensors = "pt").input_features
+
         with torch.no_grad():
             logits = model(
                 input_features,
@@ -778,11 +781,11 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                3.0952, -7.5706,  4.4620,  4.0414,  1.1512, -5.6110,  4.9799, -2.3589,
-                0.2374,  1.3166,  2.4747,  5.3375,  1.8736, -7.0334,  4.5650,  7.2630,
-                5.3717, -1.4470,  8.2825,  0.0850,  0.2813, -9.2511, -1.0556, -7.4180,
-                1.3470,  3.8482,  6.9497, -5.4600, -2.3341,  8.6016
-         ]
+                2.9892, -6.7607,  5.7348,  3.6095,  0.2152, -5.7321,  4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
+                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3714,
+                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
+            ]
         )
         # fmt: on
         self.assertTrue(torch.allclose(logits[0][0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
@@ -790,18 +793,17 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -2.7021, -4.1197,  1.4249,  1.0848,  2.6796, -1.2697, -4.7105,  0.6435,
-                -1.2619, -0.2921, -0.1743, -1.0284, -4.8171, -1.8063, -0.8316,  3.6740,
-                -0.2714,  0.2152, -0.2616, -1.6536,  0.5402,  0.0383,  0.0252,  1.2742,
-                0.5945, -6.7489, -1.9650,  3.0372,  1.9253,  1.6359
+                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3204,  1.9836,
+                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
+                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
+                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
             ]
         )
         # fmt: on
-        
+
         head_logits = logits[0] @ model.decoder.embed_tokens.weight.T
         self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
 
-
     def test_small_logits_librispeech(self):
 
         torch_device = "cpu"
@@ -811,7 +813,7 @@ def test_small_logits_librispeech(self):
 
         # processor = self.default_processor
 
-        input_speech = self._load_datasamples(1)[-1]
+        input_speech = self._load_datasamples(1)
 
         feaure_extractor = WhisperFeatureExtractor()
         tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
@@ -819,9 +821,7 @@ def test_small_logits_librispeech(self):
 
         processor = WhisperProcessor(feaure_extractor, tokenizer)
 
-        input_features = processor(
-            audio=input_speech, return_tensors="pt"
-        ).input_features.to(torch_device)
+        input_features = processor(audio=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         with torch.no_grad():
             logits = model(
@@ -909,7 +909,10 @@ def test_generation_en_only(self):
         generated_ids = model.generate(input_features, num_beams=5)
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
-        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
+        EXPECTED_TRANSCRIPT = (
+            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            " classes and we are glad"
+        )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
     def test_generation(self):
@@ -926,10 +929,15 @@ def test_generation(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids = torch.tensor([ [50258, 50259, 50359, 50363]]))
+        generated_ids = model.generate(
+            input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50363]])
+        )
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
-        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
+        EXPECTED_TRANSCRIPT = (
+            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            " classes and we are glad"
+        )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
     def test_large_generation(self):
@@ -946,11 +954,25 @@ def test_large_generation(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small.en")
 
-        logits_processor = LogitsProcessorList([SuppressBlank(tokenizer.encode(" "),tokenizer.eos_token_id ), SuppressTokens(tokenizer._get_suppress_tokens("-1"))]    )
-
+        logits_processor = LogitsProcessorList(
+            [
+                SuppressBlank(tokenizer.encode(" "), tokenizer.eos_token_id),
+                SuppressTokens(tokenizer._get_suppress_tokens("-1")),
+            ]
+        )
 
-        generated_ids = model.generate(input_features, do_sample = False, logits_processor=logits_processor, decoder_start_token_id=torch.tensor([ [50257, 50362]] ), attention_mask=None, decoder_attention_mask =None  )
+        generated_ids = model.generate(
+            input_features,
+            do_sample=False,
+            logits_processor=logits_processor,
+            decoder_input_ids=torch.tensor([[50257, 50362]]),
+            attention_mask=None,
+            decoder_attention_mask=None,
+        )
         transcript = tokenizer.batch_decode(generated_ids)
 
-        EXPECTED_TRANSCRIPT = "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle classes and we are glad"
+        EXPECTED_TRANSCRIPT = (
+            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            " classes and we are glad"
+        )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 095b974a0c945..19264f39f751c 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -115,9 +115,7 @@ def setUpClass(cls):
 
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
-        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
-            "ArthurZ/whisper-small"
-        )
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small")
         gpt2_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.eng")
 
         text = "다람쥐 헌 쳇바퀴에 타고파"
@@ -188,9 +186,7 @@ def test_tokenizer_equivalence(self):
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
     def test_tokenizer_special(self):
-        multilingual_tokenizer = WhisperTokenizer.from_pretrained(
-            "ArthurZ/whisper-small.eng"
-        )
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.eng")
         text = "[Denis] Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
         multilingual_tokens = multilingual_tokenizer.encode(text)
 

From e661aef6f51e1a94658204e99ce99fa2c5ecfacc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 17:59:28 +0000
Subject: [PATCH 041/156] style

---
 .../models/whisper/configuration_whisper.py   |  2 +-
 .../whisper/convert_openai_whisper_to_tfms.py | 36 ++++++++-----
 .../models/whisper/modeling_whisper.py        | 53 +++++++++----------
 3 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index c73b54bbc4fcc..319115fb1f5ad 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -189,6 +189,6 @@ def __init__(
             eos_token_id=eos_token_id,
             is_encoder_decoder=is_encoder_decoder,
             decoder_start_token_id=decoder_start_token_id,
-            tie_word_embeddings = tie_word_embeddings,
+            tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 8d3e38810232e..8e9d0b5ce6bfc 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -23,7 +23,14 @@
 from torch import nn
 from tqdm import tqdm
 
-from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperModel, WhisperFeatureExtractor, WhisperProcessor
+from transformers import (
+    WhisperConfig,
+    WhisperFeatureExtractor,
+    WhisperForConditionalGeneration,
+    WhisperModel,
+    WhisperProcessor,
+    WhisperTokenizer,
+)
 
 
 def remove_ignore_keys_(state_dict):
@@ -182,7 +189,7 @@ def convert_every_model(save_dir="whisper"):
     heads = [6, 8, 12, 16, 20]
     name = ["tiny", "base", "small", "medium", "large"]
     for l, w, h, n in zip(layers, width, heads, name):
-        
+
         config = WhisperConfig(
             vocab_size=51865,
             encoder_layers=l,
@@ -190,8 +197,8 @@ def convert_every_model(save_dir="whisper"):
             decoder_attention_heads=h,
             decoder_layers=l,
             d_model=w,
-            decoder_ffn_dim=4*w,
-            encoder_ffn_dim=4*w,
+            decoder_ffn_dim=4 * w,
+            encoder_ffn_dim=4 * w,
         )
         model = WhisperModel(config)
 
@@ -206,16 +213,16 @@ def convert_every_model(save_dir="whisper"):
         if missing == []:
             print("succesfully loaded")
             model.save_pretrained(f"{save_dir}/{n}")
-        
+
         checkpoint_path = f"openai/whisper-{n}"
-        model.push_to_hub(checkpoint_path, use_auth_token = "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")
-        from transformers import WhisperTokenizer
+        model.push_to_hub(checkpoint_path, use_auth_token="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")
+
         tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
         # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
 
         feature_extractor = WhisperFeatureExtractor()
         processor = WhisperProcessor(feature_extractor, tokenizer)
-        processor.push_to_hub(checkpoint_path, use_auth_token = "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")
+        processor.push_to_hub(checkpoint_path, use_auth_token="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")
 
     # for en only, decoder input_ids = 50257
 
@@ -228,8 +235,8 @@ def convert_every_model(save_dir="whisper"):
             decoder_attention_heads=h,
             decoder_layers=l,
             d_model=w,
-            decoder_ffn_dim=4*w,
-            encoder_ffn_dim=4*w,
+            decoder_ffn_dim=4 * w,
+            encoder_ffn_dim=4 * w,
         )
         model = WhisperModel(config)
 
@@ -244,16 +251,17 @@ def convert_every_model(save_dir="whisper"):
         if missing == []:
             print("succesfully loaded")
             model.save_pretrained(f"{save_dir}/{n}")
-        
+
         checkpoint_path = f"openai/whisper-{n}"
-        model.push_to_hub(checkpoint_path, use_auth_token = "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")
-        from transformers import WhisperTokenizer
+        model.push_to_hub(checkpoint_path, use_auth_token="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")
+
         tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper-any.en")
         # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
 
         feature_extractor = WhisperFeatureExtractor()
         processor = WhisperProcessor(feature_extractor, tokenizer)
-        processor.push_to_hub(checkpoint_path, use_auth_token = "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")
+        processor.push_to_hub(checkpoint_path, use_auth_token="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")
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index c988ed0ad9579..2ba49c4a13326 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -81,7 +81,7 @@ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_
 
 
 # Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None ):
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
     """
@@ -459,9 +459,9 @@ def _init_weights(self, module):
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
-    
-    def _get_feat_extract_output_lengths(self, input:int):
-        return (input-1)//2 +1
+
+    def _get_feat_extract_output_lengths(self, input: int):
+        return (input - 1) // 2 + 1
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
@@ -469,7 +469,7 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, :, -1]
 
-        subsampled_lengths = (attention_mask.sum(-1)-1)//2 +1
+        subsampled_lengths = (attention_mask.sum(-1) - 1) // 2 + 1
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
@@ -675,7 +675,6 @@ def forward(
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
-
         # subsample attention mask if necessary
         if attention_mask is not None:
             attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
@@ -683,7 +682,7 @@ def forward(
             attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
         # else:
         #     attention_mask = torch.ones([], dtype=torch.long, device=inputs_embeds.device)
-  
+        
 
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -778,15 +777,15 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
         # create causal mask
         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
         combined_attention_mask = None
-    
-        if input_shape[-1] > 1  :
+
+        if input_shape[-1] > 1:
             combined_attention_mask = _make_causal_mask(
                 input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
             ).to(inputs_embeds.device)
 
         if attention_mask is not None:
-            if attention_mask.shape[-1] > input_shape[-1] > 1  :
-                attention_mask = attention_mask[:,:input_shape[-1]]
+            if attention_mask.shape[-1] > input_shape[-1] > 1:
+                attention_mask = attention_mask[:, : input_shape[-1]]
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
             combined_attention_mask = (
@@ -1038,8 +1037,6 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
-
-
     @add_start_docstrings_to_model_forward(WHISPER_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
@@ -1310,21 +1307,19 @@ def prepare_inputs_for_generation(
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
 
-    def _prepare_attention_mask_for_generation(
-        self,
-        inputs: torch.Tensor,
-        pad_token_id: Optional[int],
-        eos_token_id: Optional[int],
-    ) -> torch.LongTensor:
-        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.int, torch.long]
-        is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
-        is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id)
-
-        # Check if input is input_ids and padded -> only then is attention_mask defined
-        if is_mel_spec and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
-            return inputs.ne(pad_token_id).long()[:, :, : self.max_source_positions, : self.max_source_positions]
-        else:
-            return None
+    # def _prepare_attention_mask_for_generation(
+    #     self,
+    #     inputs: torch.Tensor,
+    #     pad_token_id: Optional[int],
+    #     eos_token_id: Optional[int],
+    # ) -> torch.LongTensor:
+    #     is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
+    #     pad_token_id = -0.8060266971588135
+    #     # Check if input is input_ids and padded -> only then is attention_mask defined
+    #     if is_mel_spec :
+    #         return inputs.ne(pad_token_id).long()
+    #     else:
+    #         return None
 
     def _prepare_decoder_input_ids_for_generation(
         self,
@@ -1341,7 +1336,7 @@ def _prepare_decoder_input_ids_for_generation(
             decoder_start_token_id = list(self.config.decoder_start_token_id)
             if device is None:
                 device = self.device
-            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device) 
+            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device)
 
     @staticmethod
     def _reorder_cache(past, beam_idx):

From 2859221d3e76bdb48978f2dc93b0caf47b62c07e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 18:20:34 +0000
Subject: [PATCH 042/156] add absrtact

---
 docs/source/en/model_doc/whisper.mdx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
index a5e02c5f1a81b..25ddb6d7c0e83 100644
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -14,12 +14,13 @@ specific language governing permissions and limitations under the License.
 
 ## Overview
 
-The Whisper model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
+The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
 <INSERT SHORT SUMMARY HERE>
 
 The abstract from the paper is the following:
 
-*<INSERT PAPER ABSTRACT HERE>*
+*We study the capabilities of speech processing systems trained simply to predict large amounts of transcripts of audio on the internet. When scaled to 680,000 hours of multilingual and multitask supervision, the resulting models generalize well to standard benchmarks and are often competitive with prior fully supervised results but in a zeroshot transfer setting without the need for any finetuning. When compared to humans, the models approach their accuracy and robustness. We are releasing models and inference code to serve as a foundation for further work on robust speech processing.*
+
 
 Tips:
 

From 9a69dbd5f80709d96ad1a8862ffa14165ce4e09d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 18:38:48 +0000
Subject: [PATCH 043/156] large logits wioth custom decoder input ids

---
 tests/models/whisper/test_modeling_whisper.py | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index bde1f966d23ca..a740a7e9ee469 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -781,10 +781,10 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9892, -6.7607,  5.7348,  3.6095,  0.2152, -5.7321,  4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
-                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3714,
-                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
+                2.9892, -6.7607, 5.7348, 3.6095, 0.2152, -5.7321, 4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
+                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3714,
+                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
             ]
         )
         # fmt: on
@@ -793,10 +793,10 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3204,  1.9836,
-                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
-                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
-                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
+                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3204, 1.9836,
+                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
+                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
+                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
             ]
         )
         # fmt: on
@@ -883,10 +883,10 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1807, 1.1505, 4.8049, 3.9549, 2.7182, 4.1885, -0.4179, 2.8316,
-                2.0155, 2.2740, 2.6727, 1.3789, 0.5620, 2.2096, 1.6781, 2.8227,
-                1.4421, 0.9057, 1.3358, 2.2104, 2.7468, 2.0021, 2.6960, 1.5925,
-                2.2239, 1.9396, 4.0580, 5.7722, 4.8056, 4.2416
+                2.1417, 0.9379, 4.4650, 3.5576, 2.4032, 3.8589, -0.6490, 2.5477,
+                1.8330, 1.9925, 2.3441, 1.4747, 0.5453, 2.2641, 1.5200, 2.5393,
+                1.1657, 0.6221, 1.0749, 1.8284, 2.4085, 1.6626, 2.3525, 1.3372,
+                1.9910, 1.8686, 3.8962, 5.3653, 4.4751, 3.9166
             ]
         )
         # fmt: on

From 6f1858d818b8c7c3c7099da37d2bb088191fb02f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 18:39:00 +0000
Subject: [PATCH 044/156] wraap around is otrch available

---
 .../models/whisper/feature_extraction_whisper.py             | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 01c49de281b29..6c8521983f3b8 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -19,7 +19,10 @@
 from typing import List, Optional, Union
 
 import numpy as np
-import torch
+from transformers import is_torch_available
+
+if is_torch_available():
+    import torch
 
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature

From ac40d6d627eaf1a24e3af518051b703e9db24acd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 19:12:25 +0000
Subject: [PATCH 045/156] fix feature extractor

---
 .../models/whisper/feature_extraction_whisper.py   | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 6c8521983f3b8..cf84a0b077b1f 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -19,8 +19,10 @@
 from typing import List, Optional, Union
 
 import numpy as np
+
 from transformers import is_torch_available
 
+
 if is_torch_available():
     import torch
 
@@ -224,7 +226,7 @@ def __call__(
         )
 
         if is_batched:
-            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+            raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
         elif not is_batched and not isinstance(raw_speech, np.ndarray):
             raw_speech = np.asarray(raw_speech, dtype=np.float32)
         elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
@@ -232,9 +234,9 @@ def __call__(
 
         # always return batch
         if not is_batched:
-            raw_speech = [raw_speech]
-            
-        batched_speech = BatchFeature({"input_features": [np.asarray(raw_speech).T]})
+            raw_speech = [raw_speech.T]
+
+        batched_speech = BatchFeature({"input_features": raw_speech})
 
         # convert into correct format for padding
 
@@ -251,12 +253,12 @@ def __call__(
         input_features = padded_inputs.get("input_features").transpose(0, 2, 1)
         input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
 
-        if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0],List) :
+        if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0], List):
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
         else:
             padded_inputs["input_features"] = input_features
 
-        attention_mask = np.asarray(padded_inputs.get("attention_mask"))[:,:self.nb_max_frame]
+        attention_mask = np.asarray(padded_inputs.get("attention_mask"))[:, : self.nb_max_frame]
         if attention_mask is not None:
             padded_inputs["attention_mask"] = [attention_mask]
 

From f8f74635285eeb374f3b753f3249cce77a803713 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 19:13:34 +0000
Subject: [PATCH 046/156] correct logits for whisper small.en

---
 tests/models/whisper/test_modeling_whisper.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index a740a7e9ee469..ce0f48ae9d859 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -744,7 +744,6 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 @require_torchaudio
 @require_sentencepiece
 @require_tokenizers
-# @slow
 class WhisperModelIntegrationTests(unittest.TestCase):
     @cached_property
     def default_processor(self):
@@ -766,7 +765,7 @@ def test_inference_no_head(self):
         model.to(torch_device)
         input_speech = self._load_datasamples(1)
         feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(2*input_speech, return_tensors = "pt").input_features
+        input_features = feature_extractor(2 * input_speech, return_tensors="pt").input_features
 
         with torch.no_grad():
             logits = model(
@@ -837,15 +836,16 @@ def test_small_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -3.6748, -5.8862, -6.7038, -8.0681, -5.9222, -7.5940, -4.6479, -6.2475,
-                -3.6708, -4.0578, -6.5905, -4.0916, -6.9554, -3.9227, -5.4782, -5.1931,
-                -5.8127, -6.6239, -6.4373, -6.8171, -7.0080, -7.8014, -6.8085, -8.3919,
-                -6.5980, -5.6730, -4.6434, -7.4606, -8.5103, -5.4635
+                -3.5023, -5.8727, -7.1252, -8.5208, -6.0207, -7.8296, -4.6376, -6.3990,
+                -3.7516, -4.0411, -6.8055, -3.7937, -6.8897, -3.4925, -5.4489, -5.2272,
+                -5.7970, -6.8300, -6.4165, -6.9162, -7.2233, -8.0165, -6.9419, -8.7574,
+                -6.9695, -5.8984, -4.6315, -8.2338, -8.9415, -5.8150
             ]
         )
         # fmt: on
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
+    @slow
     def test_large_logits_librispeech(self):
 
         torch_device = "cpu"
@@ -915,6 +915,7 @@ def test_generation_en_only(self):
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
+    @slow
     def test_generation(self):
 
         torch_device = "cpu"
@@ -929,9 +930,7 @@ def test_generation(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        generated_ids = model.generate(
-            input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50363]])
-        )
+        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50363]]))
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (
@@ -940,6 +939,7 @@ def test_generation(self):
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
+    @slow
     def test_large_generation(self):
 
         torch_device = "cpu"

From 5540472b4d970f914d626c049e02fb0d7f7a9abb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 19:22:25 +0000
Subject: [PATCH 047/156] nit

---
 src/transformers/models/whisper/feature_extraction_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index cf84a0b077b1f..72fcd583e8505 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -250,7 +250,7 @@ def __call__(
             **kwargs,
         )
         # make sure list is in array format
-        input_features = padded_inputs.get("input_features").transpose(0, 2, 1)
+        input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
         input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
 
         if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0], List):

From 71ac3f751699fb56631a05a8dd3739f9f38c2c34 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 26 Sep 2022 21:04:15 +0000
Subject: [PATCH 048/156] fix encoder_attentino_mask

---
 .../models/whisper/modeling_whisper.py        | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 2ba49c4a13326..78b455aa39831 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -467,7 +467,7 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
         # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
         if len(attention_mask.shape) > 2:
-            attention_mask = attention_mask[:, :, -1]
+            attention_mask = attention_mask[:, 0, :]
 
         subsampled_lengths = (attention_mask.sum(-1) - 1) // 2 + 1
         bsz = attention_mask.size()[0]
@@ -682,7 +682,6 @@ def forward(
             attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
         # else:
         #     attention_mask = torch.ones([], dtype=torch.long, device=inputs_embeds.device)
-        
 
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
@@ -898,7 +897,6 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
@@ -1111,6 +1109,8 @@ def forward(
         else:
             encoder_attention_mask = None
 
+        
+
         # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
@@ -1307,19 +1307,19 @@ def prepare_inputs_for_generation(
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
 
-    # def _prepare_attention_mask_for_generation(
-    #     self,
-    #     inputs: torch.Tensor,
-    #     pad_token_id: Optional[int],
-    #     eos_token_id: Optional[int],
-    # ) -> torch.LongTensor:
-    #     is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
-    #     pad_token_id = -0.8060266971588135
-    #     # Check if input is input_ids and padded -> only then is attention_mask defined
-    #     if is_mel_spec :
-    #         return inputs.ne(pad_token_id).long()
-    #     else:
-    #         return None
+    def _prepare_attention_mask_for_generation(
+        self,
+        inputs: torch.Tensor,
+        pad_token_id: Optional[int],
+        eos_token_id: Optional[int],
+    ) -> torch.LongTensor:
+        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
+        pad_token_id = -0.8060266971588135
+        # Check if input is input_ids and padded -> only then is attention_mask defined
+        if is_mel_spec:
+            return inputs.ne(pad_token_id).long()
+        else:
+            return None
 
     def _prepare_decoder_input_ids_for_generation(
         self,

From 5f4a1f90c590b4cd270a42624ca8fc1b2fb4558c Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 26 Sep 2022 22:14:53 +0000
Subject: [PATCH 049/156] some fixes

---
 .../models/whisper/modeling_whisper.py        | 60 +++++++++----------
 src/transformers/tokenization_utils_base.py   |  1 +
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 78b455aa39831..601a5bbe7857e 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1307,37 +1307,37 @@ def prepare_inputs_for_generation(
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
 
-    def _prepare_attention_mask_for_generation(
-        self,
-        inputs: torch.Tensor,
-        pad_token_id: Optional[int],
-        eos_token_id: Optional[int],
-    ) -> torch.LongTensor:
-        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
-        pad_token_id = -0.8060266971588135
+#    def _prepare_attention_mask_for_generation(
+#        self,
+#        inputs: torch.Tensor,
+#        pad_token_id: Optional[int],
+#        eos_token_id: Optional[int],
+#    ) -> torch.LongTensor:
+#        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
+#        pad_token_id = -0.8060266971588135
         # Check if input is input_ids and padded -> only then is attention_mask defined
-        if is_mel_spec:
-            return inputs.ne(pad_token_id).long()
-        else:
-            return None
-
-    def _prepare_decoder_input_ids_for_generation(
-        self,
-        batch_size: int,
-        decoder_start_token_id: int = None,
-        bos_token_id: int = None,
-        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
-        device: torch.device = None,
-    ) -> torch.LongTensor:
-
-        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
-            return model_kwargs.pop("decoder_input_ids")
-        else:
-            decoder_start_token_id = list(self.config.decoder_start_token_id)
-            if device is None:
-                device = self.device
-            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device)
-
+#        if is_mel_spec:
+#            return inputs.ne(pad_token_id).long()
+#        else:
+#            return None
+
+#    def _prepare_decoder_input_ids_for_generation(
+#        self,
+#        batch_size: int,
+#        decoder_start_token_id: int = None,
+#        bos_token_id: int = None,
+#        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+#        device: torch.device = None,
+#    ) -> torch.LongTensor:
+#
+#        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+#            return model_kwargs.pop("decoder_input_ids")
+#        else:
+#            decoder_start_token_id = list(self.config.decoder_start_token_id)
+#            if device is None:
+#                device = self.device
+#            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device)
+#
     @staticmethod
     def _reorder_cache(past, beam_idx):
         reordered_past = ()
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 54d562136db4a..65eda3fb0ba32 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1822,6 +1822,7 @@ def _from_pretrained(
         if tokenizer_config_file is not None:
             with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                 init_kwargs = json.load(tokenizer_config_handle)
+
             # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
             config_tokenizer_class = init_kwargs.get("tokenizer_class")
             init_kwargs.pop("tokenizer_class", None)

From cda4759e7f1ed3f17bda47b1b7131e7795886e91 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 26 Sep 2022 22:29:15 +0000
Subject: [PATCH 050/156] remove unnecessary inputs

---
 src/transformers/models/whisper/modeling_whisper.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 601a5bbe7857e..0bd206c2663e9 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1284,10 +1284,6 @@ def prepare_inputs_for_generation(
         self,
         decoder_input_ids,
         past=None,
-        attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs
@@ -1301,10 +1297,7 @@ def prepare_inputs_for_generation(
             "past_key_values": past,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
-            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
+            "use_cache": use_cache,
         }
 
 #    def _prepare_attention_mask_for_generation(

From 7b892ddd8cf4ba297080b3a9e6d4aa42f8706fe7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 05:35:34 +0000
Subject: [PATCH 051/156] nits

---
 .../models/whisper/feature_extraction_whisper.py  |  2 +-
 tests/models/whisper/test_modeling_whisper.py     | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 72fcd583e8505..5e97bdafd04a4 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -234,7 +234,7 @@ def __call__(
 
         # always return batch
         if not is_batched:
-            raw_speech = [raw_speech.T]
+            raw_speech = [np.asarray([raw_speech]).T]
 
         batched_speech = BatchFeature({"input_features": raw_speech})
 
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index ce0f48ae9d859..a5855a5d02dda 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -108,7 +108,7 @@ def __init__(
         eos_token_id=98,
         pad_token_id=0,
         num_mel_bins=80,
-        decoder_start_token_id=[85, 87],
+        decoder_start_token_id=85,
         num_conv_layers=1,
     ):
         self.parent = parent
@@ -139,7 +139,7 @@ def prepare_config_and_inputs(self):
         attention_mask = torch.ones(
             [self.batch_size, self.max_source_positions], dtype=torch.long, device=torch_device
         )
-        decoder_input_ids = torch.tensor(self.batch_size * [self.decoder_start_token_id], device=torch_device)
+        decoder_input_ids = torch.tensor(self.batch_size * [[self.decoder_start_token_id]], device=torch_device)
 
         config = self.get_config()
         inputs_dict = prepare_whisper_inputs_dict(
@@ -338,6 +338,9 @@ def test_training(self):
     def test_training_gradient_checkpointing(self):
         pass
 
+    def test_generate_with_head_masking(self):
+        pass
+    
     def test_generate_fp16(self):
         config, input_dict = self.model_tester.prepare_config_and_inputs()
         config.max_target_positions = 400
@@ -406,7 +409,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                 self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
 
-                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 2)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1)
 
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
@@ -430,9 +433,9 @@ def test_attention_outputs(self):
         config.return_dict = True
 
         seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 2)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", 1)
         encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length", 2)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", 1)
         encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
 
         for model_class in self.all_model_classes:
@@ -930,7 +933,7 @@ def test_generation(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50363]]))
+        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=torch.tensor([[50257, 50258, 50362]]))
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (

From d49614b1fad17bbc91880935129eaea8c90c1ef2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 05:57:09 +0000
Subject: [PATCH 052/156] add normalizer file

---
 .../models/whisper/english_normalizer.py      | 557 ++++++++++++++++++
 1 file changed, 557 insertions(+)
 create mode 100644 src/transformers/models/whisper/english_normalizer.py

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
new file mode 100644
index 0000000000000..3a6626d028532
--- /dev/null
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -0,0 +1,557 @@
+# Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+from fractions import Fraction
+from typing import Iterator, List, Match, Optional, Union
+
+from more_itertools import windowed
+
+from .basic import remove_symbols_and_diacritics
+
+
+class EnglishNumberNormalizer:
+    """
+    Convert any spelled-out numbers into arabic numbers, while handling:
+
+    - remove any commas
+    - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
+    - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
+    - spell out `one` and `ones`
+    - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.zeros = {"o", "oh", "zero"}
+        self.ones = {
+            name: i
+            for i, name in enumerate(
+                [
+                    "one",
+                    "two",
+                    "three",
+                    "four",
+                    "five",
+                    "six",
+                    "seven",
+                    "eight",
+                    "nine",
+                    "ten",
+                    "eleven",
+                    "twelve",
+                    "thirteen",
+                    "fourteen",
+                    "fifteen",
+                    "sixteen",
+                    "seventeen",
+                    "eighteen",
+                    "nineteen",
+                ],
+                start=1,
+            )
+        }
+        self.ones_plural = {
+            "sixes" if name == "six" else name + "s": (value, "s")
+            for name, value in self.ones.items()
+        }
+        self.ones_ordinal = {
+            "zeroth": (0, "th"),
+            "first": (1, "st"),
+            "second": (2, "nd"),
+            "third": (3, "rd"),
+            "fifth": (5, "th"),
+            "twelfth": (12, "th"),
+            **{
+                name + ("h" if name.endswith("t") else "th"): (value, "th")
+                for name, value in self.ones.items()
+                if value > 3 and value != 5 and value != 12
+            },
+        }
+        self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
+
+        self.tens = {
+            "twenty": 20,
+            "thirty": 30,
+            "forty": 40,
+            "fifty": 50,
+            "sixty": 60,
+            "seventy": 70,
+            "eighty": 80,
+            "ninety": 90,
+        }
+        self.tens_plural = {
+            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
+        }
+        self.tens_ordinal = {
+            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
+        }
+        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
+
+        self.multipliers = {
+            "hundred": 100,
+            "thousand": 1_000,
+            "million": 1_000_000,
+            "billion": 1_000_000_000,
+            "trillion": 1_000_000_000_000,
+            "quadrillion": 1_000_000_000_000_000,
+            "quintillion": 1_000_000_000_000_000_000,
+            "sextillion": 1_000_000_000_000_000_000_000,
+            "septillion": 1_000_000_000_000_000_000_000_000,
+            "octillion": 1_000_000_000_000_000_000_000_000_000,
+            "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
+            "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
+        }
+        self.multipliers_plural = {
+            name + "s": (value, "s") for name, value in self.multipliers.items()
+        }
+        self.multipliers_ordinal = {
+            name + "th": (value, "th") for name, value in self.multipliers.items()
+        }
+        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
+        self.decimals = {*self.ones, *self.tens, *self.zeros}
+
+        self.preceding_prefixers = {
+            "minus": "-",
+            "negative": "-",
+            "plus": "+",
+            "positive": "+",
+        }
+        self.following_prefixers = {
+            "pound": "£",
+            "pounds": "£",
+            "euro": "€",
+            "euros": "€",
+            "dollar": "$",
+            "dollars": "$",
+            "cent": "¢",
+            "cents": "¢",
+        }
+        self.prefixes = set(
+            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
+        )
+        self.suffixers = {
+            "per": {"cent": "%"},
+            "percent": "%",
+        }
+        self.specials = {"and", "double", "triple", "point"}
+
+        self.words = set(
+            [
+                key
+                for mapping in [
+                    self.zeros,
+                    self.ones,
+                    self.ones_suffixed,
+                    self.tens,
+                    self.tens_suffixed,
+                    self.multipliers,
+                    self.multipliers_suffixed,
+                    self.preceding_prefixers,
+                    self.following_prefixers,
+                    self.suffixers,
+                    self.specials,
+                ]
+                for key in mapping
+            ]
+        )
+        self.literal_words = {"one", "ones"}
+
+    def process_words(self, words: List[str]) -> Iterator[str]:
+        prefix: Optional[str] = None
+        value: Optional[Union[str, int]] = None
+        skip = False
+
+        def to_fraction(s: str):
+            try:
+                return Fraction(s)
+            except ValueError:
+                return None
+
+        def output(result: Union[str, int]):
+            nonlocal prefix, value
+            result = str(result)
+            if prefix is not None:
+                result = prefix + result
+            value = None
+            prefix = None
+            return result
+
+        if len(words) == 0:
+            return
+
+        for prev, current, next in windowed([None] + words + [None], 3):
+            if skip:
+                skip = False
+                continue
+
+            next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
+            has_prefix = current[0] in self.prefixes
+            current_without_prefix = current[1:] if has_prefix else current
+            if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
+                # arabic numbers (potentially with signs and fractions)
+                f = to_fraction(current_without_prefix)
+                assert f is not None
+                if value is not None:
+                    if isinstance(value, str) and value.endswith("."):
+                        # concatenate decimals / ip address components
+                        value = str(value) + str(current)
+                        continue
+                    else:
+                        yield output(value)
+
+                prefix = current[0] if has_prefix else prefix
+                if f.denominator == 1:
+                    value = f.numerator  # store integers as int
+                else:
+                    value = current_without_prefix
+            elif current not in self.words:
+                # non-numeric words
+                if value is not None:
+                    yield output(value)
+                yield output(current)
+            elif current in self.zeros:
+                value = str(value or "") + "0"
+            elif current in self.ones:
+                ones = self.ones[current]
+
+                if value is None:
+                    value = ones
+                elif isinstance(value, str) or prev in self.ones:
+                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
+                        assert value[-1] == "0"
+                        value = value[:-1] + str(ones)
+                    else:
+                        value = str(value) + str(ones)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        value += ones
+                    else:
+                        value = str(value) + str(ones)
+            elif current in self.ones_suffixed:
+                # ordinal or cardinal; yield the number right away
+                ones, suffix = self.ones_suffixed[current]
+                if value is None:
+                    yield output(str(ones) + suffix)
+                elif isinstance(value, str) or prev in self.ones:
+                    if prev in self.tens and ones < 10:
+                        assert value[-1] == "0"
+                        yield output(value[:-1] + str(ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                elif ones < 10:
+                    if value % 10 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                else:  # eleven to nineteen
+                    if value % 100 == 0:
+                        yield output(str(value + ones) + suffix)
+                    else:
+                        yield output(str(value) + str(ones) + suffix)
+                value = None
+            elif current in self.tens:
+                tens = self.tens[current]
+                if value is None:
+                    value = tens
+                elif isinstance(value, str):
+                    value = str(value) + str(tens)
+                else:
+                    if value % 100 == 0:
+                        value += tens
+                    else:
+                        value = str(value) + str(tens)
+            elif current in self.tens_suffixed:
+                # ordinal or cardinal; yield the number right away
+                tens, suffix = self.tens_suffixed[current]
+                if value is None:
+                    yield output(str(tens) + suffix)
+                elif isinstance(value, str):
+                    yield output(str(value) + str(tens) + suffix)
+                else:
+                    if value % 100 == 0:
+                        yield output(str(value + tens) + suffix)
+                    else:
+                        yield output(str(value) + str(tens) + suffix)
+            elif current in self.multipliers:
+                multiplier = self.multipliers[current]
+                if value is None:
+                    value = multiplier
+                elif isinstance(value, str) or value == 0:
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        value = p.numerator
+                    else:
+                        yield output(value)
+                        value = multiplier
+                else:
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+            elif current in self.multipliers_suffixed:
+                multiplier, suffix = self.multipliers_suffixed[current]
+                if value is None:
+                    yield output(str(multiplier) + suffix)
+                elif isinstance(value, str):
+                    f = to_fraction(value)
+                    p = f * multiplier if f is not None else None
+                    if f is not None and p.denominator == 1:
+                        yield output(str(p.numerator) + suffix)
+                    else:
+                        yield output(value)
+                        yield output(str(multiplier) + suffix)
+                else:  # int
+                    before = value // 1000 * 1000
+                    residual = value % 1000
+                    value = before + residual * multiplier
+                    yield output(str(value) + suffix)
+                value = None
+            elif current in self.preceding_prefixers:
+                # apply prefix (positive, minus, etc.) if it precedes a number
+                if value is not None:
+                    yield output(value)
+
+                if next in self.words or next_is_numeric:
+                    prefix = self.preceding_prefixers[current]
+                else:
+                    yield output(current)
+            elif current in self.following_prefixers:
+                # apply prefix (dollars, cents, etc.) only after a number
+                if value is not None:
+                    prefix = self.following_prefixers[current]
+                    yield output(value)
+                else:
+                    yield output(current)
+            elif current in self.suffixers:
+                # apply suffix symbols (percent -> '%')
+                if value is not None:
+                    suffix = self.suffixers[current]
+                    if isinstance(suffix, dict):
+                        if next in suffix:
+                            yield output(str(value) + suffix[next])
+                            skip = True
+                        else:
+                            yield output(value)
+                            yield output(current)
+                    else:
+                        yield output(str(value) + suffix)
+                else:
+                    yield output(current)
+            elif current in self.specials:
+                if next not in self.words and not next_is_numeric:
+                    # apply special handling only if the next word can be numeric
+                    if value is not None:
+                        yield output(value)
+                    yield output(current)
+                elif current == "and":
+                    # ignore "and" after hundreds, thousands, etc.
+                    if prev not in self.multipliers:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "double" or current == "triple":
+                    if next in self.ones or next in self.zeros:
+                        repeats = 2 if current == "double" else 3
+                        ones = self.ones.get(next, 0)
+                        value = str(value or "") + str(ones) * repeats
+                        skip = True
+                    else:
+                        if value is not None:
+                            yield output(value)
+                        yield output(current)
+                elif current == "point":
+                    if next in self.decimals or next_is_numeric:
+                        value = str(value or "") + "."
+                else:
+                    # should all have been covered at this point
+                    raise ValueError(f"Unexpected token: {current}")
+            else:
+                # all should have been covered at this point
+                raise ValueError(f"Unexpected token: {current}")
+
+        if value is not None:
+            yield output(value)
+
+    def preprocess(self, s: str):
+        # replace "<number> and a half" with "<number> point five"
+        results = []
+
+        segments = re.split(r"\band\s+a\s+half\b", s)
+        for i, segment in enumerate(segments):
+            if len(segment.strip()) == 0:
+                continue
+            if i == len(segments) - 1:
+                results.append(segment)
+            else:
+                results.append(segment)
+                last_word = segment.rsplit(maxsplit=2)[-1]
+                if last_word in self.decimals or last_word in self.multipliers:
+                    results.append("point five")
+                else:
+                    results.append("and a half")
+
+        s = " ".join(results)
+
+        # put a space at number/letter boundary
+        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
+        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
+
+        # but remove spaces which could be a suffix
+        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
+
+        return s
+
+    def postprocess(self, s: str):
+        def combine_cents(m: Match):
+            try:
+                currency = m.group(1)
+                integer = m.group(2)
+                cents = int(m.group(3))
+                return f"{currency}{integer}.{cents:02d}"
+            except ValueError:
+                return m.string
+
+        def extract_cents(m: Match):
+            try:
+                return f"¢{int(m.group(1))}"
+            except ValueError:
+                return m.string
+
+        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
+        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
+        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
+
+        # write "one(s)" instead of "1(s)", just for the readability
+        s = re.sub(r"\b1(s?)\b", r"one\1", s)
+
+        return s
+
+    def __call__(self, s: str):
+        s = self.preprocess(s)
+        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
+        s = self.postprocess(s)
+
+        return s
+
+
+class EnglishSpellingNormalizer:
+    """
+    Applies British-American spelling mappings as listed in [1].
+
+    [1] https://www.tysto.com/uk-us-spelling-list.html
+    """
+
+    def __init__(self):
+        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
+        self.mapping = json.load(open(mapping_path))
+
+    def __call__(self, s: str):
+        return " ".join(self.mapping.get(word, word) for word in s.split())
+
+
+class EnglishTextNormalizer:
+    def __init__(self):
+        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
+        self.replacers = {
+            # common contractions
+            r"\bwon't\b": "will not",
+            r"\bcan't\b": "can not",
+            r"\blet's\b": "let us",
+            r"\bain't\b": "aint",
+            r"\by'all\b": "you all",
+            r"\bwanna\b": "want to",
+            r"\bgotta\b": "got to",
+            r"\bgonna\b": "going to",
+            r"\bi'ma\b": "i am going to",
+            r"\bimma\b": "i am going to",
+            r"\bwoulda\b": "would have",
+            r"\bcoulda\b": "could have",
+            r"\bshoulda\b": "should have",
+            r"\bma'am\b": "madam",
+            # contractions in titles/prefixes
+            r"\bmr\b": "mister ",
+            r"\bmrs\b": "missus ",
+            r"\bst\b": "saint ",
+            r"\bdr\b": "doctor ",
+            r"\bprof\b": "professor ",
+            r"\bcapt\b": "captain ",
+            r"\bgov\b": "governor ",
+            r"\bald\b": "alderman ",
+            r"\bgen\b": "general ",
+            r"\bsen\b": "senator ",
+            r"\brep\b": "representative ",
+            r"\bpres\b": "president ",
+            r"\brev\b": "reverend ",
+            r"\bhon\b": "honorable ",
+            r"\basst\b": "assistant ",
+            r"\bassoc\b": "associate ",
+            r"\blt\b": "lieutenant ",
+            r"\bcol\b": "colonel ",
+            r"\bjr\b": "junior ",
+            r"\bsr\b": "senior ",
+            r"\besq\b": "esquire ",
+            # prefect tenses, ideally it should be any past participles, but it's harder..
+            r"'d been\b": " had been",
+            r"'s been\b": " has been",
+            r"'d gone\b": " had gone",
+            r"'s gone\b": " has gone",
+            r"'d done\b": " had done",  # "'s done" is ambiguous
+            r"'s got\b": " has got",
+            # general contractions
+            r"n't\b": " not",
+            r"'re\b": " are",
+            r"'s\b": " is",
+            r"'d\b": " would",
+            r"'ll\b": " will",
+            r"'t\b": " not",
+            r"'ve\b": " have",
+            r"'m\b": " am",
+        }
+        self.standardize_numbers = EnglishNumberNormalizer()
+        self.standardize_spellings = EnglishSpellingNormalizer()
+
+    def __call__(self, s: str):
+        s = s.lower()
+
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        s = re.sub(self.ignore_patterns, "", s)
+        s = re.sub(r"\s+'", "'", s)  # standardize when there's a space before an apostrophe
+
+        for pattern, replacement in self.replacers.items():
+            s = re.sub(pattern, replacement, s)
+
+        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
+        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
+        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep some symbols for numerics
+
+        s = self.standardize_numbers(s)
+        s = self.standardize_spellings(s)
+
+        # now remove prefix/suffix symbols that are not preceded/followed by numbers
+        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
+        s = re.sub(r"([^0-9])%", r"\1 ", s)
+
+        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
+
+        return s

From 171e03460f7ea86b2b0ca2cdc0faa1a766a93099 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 07:12:26 +0000
Subject: [PATCH 053/156] update etst tokenization

---
 .../models/whisper/tokenization_whisper.py    |   3 -
 .../whisper/test_tokenization_whisper.py      | 172 ++++--------------
 2 files changed, 36 insertions(+), 139 deletions(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index b120bff20671a..829bdaf244868 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -367,9 +367,6 @@ def sot_sequence_including_notimestamps(self) -> Tuple[int]:
     def vocab_size(self) -> int:
         return len(self.encoder)
 
-    @property
-    def tgt_lang(self) -> str:
-        return self._tgt_lang
 
     def bpe(self, token):
         if token in self.cache:
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 19264f39f751c..c5a8f58495404 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -21,8 +21,8 @@
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
-FR_CODE = 5
-ES_CODE = 10
+EN_CODE = 50258
+ES_CODE = 50256
 
 
 @require_sentencepiece
@@ -34,7 +34,7 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
 
     def setUp(self):
         super().setUp()
-        tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper/tiny")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
         tokenizer.save_pretrained(self.tmpdirname)
 
     def test_convert_token_and_id(self):
@@ -93,14 +93,14 @@ def test_tokenizer_integration(self):
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="facebook/s2t-small-mustc-en-de-st",
+            model_name="opneai/whispoer-tiny",
             revision="a14f04cf0776c02f62a8cb800cf7909e15ea23ad",
         )
 
 
 @require_sentencepiece
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
-    checkpoint_name = "ArthurZ/whisper-small.eng"
+    checkpoint_name = "openai/whisper-small.en"
 
     transcript = (
         "'<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Nor is Mr. Quilters manner less interesting"
@@ -115,10 +115,9 @@ def setUpClass(cls):
 
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
-        multilingual_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small")
-        gpt2_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.eng")
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny",language="ko")
+        gpt2_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
 
-        text = "다람쥐 헌 쳇바퀴에 타고파"
         gpt2_tokens = gpt2_tokenizer.encode(text)
         multilingual_tokens = multilingual_tokenizer.encode(text)
 
@@ -126,146 +125,47 @@ def test_tokenizer_equivalence(self):
         assert multilingual_tokenizer.decode(multilingual_tokens) == text
         assert len(gpt2_tokens) > len(multilingual_tokens)
 
-        EXPECTED_MULTI = [
-            9835,
-            22855,
-            168,
-            98,
-            238,
-            13431,
-            234,
-            43517,
-            229,
-            47053,
-            169,
-            222,
-            19086,
-            19840,
-            1313,
-            17974,
+        # fmt: off
+        EXPECTED_ENG= [
+            46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
+            245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
+            112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
+            254, 169, 234, 234
         ]
-
-        EXPECTED_ENG = [
-            46695,
-            97,
-            167,
-            252,
-            234,
-            168,
-            98,
-            238,
-            220,
-            169,
-            245,
-            234,
-            23821,
-            111,
-            229,
-            167,
-            108,
-            242,
-            169,
-            222,
-            112,
-            168,
-            245,
-            238,
-            220,
-            169,
-            225,
-            222,
-            166,
-            111,
-            254,
-            169,
-            234,
-            234,
+        EXPECTED_MULTI = [
+            9835, 22855, 168, 98, 238, 13431, 234, 43517, 229, 47053,
+            169, 222, 19086, 19840, 1313, 17974
         ]
+        # fmt: on
 
         self.assertListEqual(gpt2_tokens, EXPECTED_ENG)
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
     def test_tokenizer_special(self):
-        multilingual_tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.eng")
-        text = "[Denis] Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
+        text = "<|startoftranscript|> Hey! How are you feeling? J'ai l'impression que 郷さん est prêt <|endoftext|>"
+
         multilingual_tokens = multilingual_tokenizer.encode(text)
 
+        # fmt: off
         EXPECTED_MULTI = [
-            58,
-            35,
-            268,
-            271,
-            60,
-            1911,
-            0,
-            1012,
-            366,
-            291,
-            2633,
-            30,
-            508,
-            6,
-            1301,
-            287,
-            6,
-            36107,
-            631,
-            220,
-            11178,
-            115,
-            15567,
-            871,
-            44393,
+            50258, 1911, 0, 1012, 366, 291, 2633, 30, 508, 6,
+            1301, 287, 6, 36107, 631, 220, 11178, 115, 15567, 871,
+            44393, 220, 50257
         ]
+        # fmt: on
+
         self.assertListEqual(multilingual_tokens, EXPECTED_MULTI)
 
         self.assertEqual(text, multilingual_tokenizer.decode(multilingual_tokens))
 
-        jp_tokenizer = WhisperTokenizer.from_pretrained(
-            "/home/arthur_huggingface_co/transformers/whisper/tiny-multy", multilingual=False, language="japanese"
-        )
-        EXPECTED_JAP = [
-            58,
-            21306,
-            271,
-            60,
-            14690,
-            0,
-            1374,
-            389,
-            345,
-            4203,
-            30,
-            449,
-            6,
-            1872,
-            300,
-            6,
-            11011,
-            2234,
-            8358,
-            16268,
-            225,
-            115,
-            43357,
-            22174,
-            1556,
-            778,
-            25792,
-            83,
-        ]
-
-        # parameters of the original tokenizer : multilingual False, language=Japanese
-        self.assertListEqual(jp_tokenizer.encode(text), EXPECTED_JAP)
+        transcript =  multilingual_tokenizer.decode(multilingual_tokens,skip_special_tokens=True)
 
-    def check_language_codes(self):
-        self.assertEqual(self.tokenizer.lang_code_to_id["pt"], 4)
-        self.assertEqual(self.tokenizer.lang_code_to_id["ru"], 6)
-        self.assertEqual(self.tokenizer.lang_code_to_id["it"], 9)
-        self.assertEqual(self.tokenizer.lang_code_to_id["de"], 11)
+        EXPECTED_JAP = "  Hey! How are you feeling? J'ai l'impression que 郷さん est prêt "
+        self.assertListEqual(transcript, EXPECTED_JAP)
 
     def test_vocab_size(self):
-        self.assertEqual(self.tokenizer.vocab_size, 10_000)
+        self.assertEqual(self.tokenizer.vocab_size, 50257)
 
     def test_tokenizer_decode_ignores_language_codes(self):
         self.assertIn(ES_CODE, self.tokenizer.all_special_ids)
@@ -276,14 +176,14 @@ def test_tokenizer_decode_ignores_language_codes(self):
         self.assertNotIn(self.tokenizer.eos_token, result)
 
     def test_tokenizer_adds_special_tokens(self):
-        self.tokenizer.tgt_lang = "fr"
+        self.tokenizer.language_token = "fr"
         encoded = self.tokenizer(self.french_text).input_ids
-        self.assertEqual(encoded[0], FR_CODE)
+        self.assertEqual(encoded[0], EN_CODE)
         self.assertEqual(encoded[-1], self.tokenizer.eos_token_id)
 
-    def test_tgt_lang_setter(self):
-        self.tokenizer.tgt_lang = "fr"
-        self.assertListEqual(self.tokenizer.prefix_tokens, [FR_CODE])
+    def test_language_token_setter(self):
+        self.tokenizer.language_token = "en"
+        self.assertListEqual(self.tokenizer.prefix_tokens, [EN_CODE])
 
-        self.tokenizer.tgt_lang = "es"
+        self.tokenizer.language_token = "es"
         self.assertListEqual(self.tokenizer.prefix_tokens, [ES_CODE])

From cae0269edad0ac990857eac50973f0a70a684894 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 08:23:32 +0000
Subject: [PATCH 054/156] fix attention mask not defined

---
 src/transformers/models/whisper/modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 0bd206c2663e9..3c3623861ac14 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1296,7 +1296,7 @@ def prepare_inputs_for_generation(
             "encoder_outputs": encoder_outputs,
             "past_key_values": past,
             "decoder_input_ids": decoder_input_ids,
-            "attention_mask": attention_mask,
+            "attention_mask": None,
             "use_cache": use_cache,
         }
 

From de47019c0f5ff2713e6eaf63001c27aee75c7c56 Mon Sep 17 00:00:00 2001
From: NielsRogge <niels.rogge1@gmail.com>
Date: Tue, 27 Sep 2022 08:47:39 +0000
Subject: [PATCH 055/156] Add model to README

---
 README.md                                          |  2 +-
 README_ko.md                                       |  2 +-
 README_zh-hans.md                                  |  2 +-
 README_zh-hant.md                                  |  2 +-
 docs/source/en/index.mdx                           |  2 +-
 .../models/whisper/modeling_whisper.py             | 14 ++++++++++----
 6 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index fa3b28b5b45b1..069ff3164683e 100644
--- a/README.md
+++ b/README.md
@@ -390,7 +390,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/README_ko.md b/README_ko.md
index 372302b18a9af..84ecb811edeab 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -340,7 +340,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 9f8199f41eb8a..bc6f06d77b74c 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -364,7 +364,7 @@ conda install -c huggingface transformers
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (来自 Facebook AI) 伴随论文 [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino 发布。
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (来自 Facebook AI) 伴随论文 [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) 由 Qiantong Xu, Alexei Baevski, Michael Auli 发布。
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (来自 OpenAI) 伴随论文 [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) 由 Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever 发布。
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (来自 Microsoft Research) 伴随论文 [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) 由 Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling 发布。
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index cac8a736ce61e..5e33a9036f53b 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -376,7 +376,7 @@ conda install -c huggingface transformers
 1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>. 
+1. **[Whisper](https://huggingface.co/docs/transformers/main/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
 1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li. 
 1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index d37ab1cbc311c..882d79472e195 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -180,7 +180,7 @@ The documentation is organized into five sections:
 1. **[Wav2Vec2-Conformer](model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
 1. **[Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
 1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
-1. **[Whisper](model_doc/whisper)** (from <FILL INSTITUTION>) released with the paper [<FILL PAPER TITLE>](<FILL ARKIV LINK>) by <FILL AUTHORS>.
+1. **[Whisper](model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
 1. **[X-CLIP](model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
 1. **[XGLM](model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
 1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 3c3623861ac14..d77ca5e7ade77 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -460,16 +460,22 @@ def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
 
-    def _get_feat_extract_output_lengths(self, input: int):
-        return (input - 1) // 2 + 1
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+        for i in range(self.config.num_conv_layers):
+            input_lengths = (input_lengths - 1) // 2 + 1
+
+        return input_lengths
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
         if len(attention_mask.shape) > 2:
-            attention_mask = attention_mask[:, 0, :]
+            attention_mask = attention_mask[:, :, -1]
 
-        subsampled_lengths = (attention_mask.sum(-1) - 1) // 2 + 1
+        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device

From 9a8f99f77a10506b58068e199fbf0f49c5f872ac Mon Sep 17 00:00:00 2001
From: NielsRogge <niels.rogge1@gmail.com>
Date: Tue, 27 Sep 2022 09:25:56 +0000
Subject: [PATCH 056/156] Fix doc tests

---
 .../models/whisper/modeling_whisper.py        | 27 +++++++------------
 utils/documentation_tests.txt                 |  1 +
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index d77ca5e7ade77..90fed9b1fb492 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -460,22 +460,16 @@ def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
 
-    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
-        """
-        Computes the output length of the convolutional layers
-        """
-        for i in range(self.config.num_conv_layers):
-            input_lengths = (input_lengths - 1) // 2 + 1
-
-        return input_lengths
+    def _get_feat_extract_output_lengths(self, input: int):
+        return (input - 1) // 2 + 1
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
         # convert it to 2D if thats the case
         if len(attention_mask.shape) > 2:
-            attention_mask = attention_mask[:, :, -1]
+            attention_mask = attention_mask[:, 0, :]
 
-        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+        subsampled_lengths = (attention_mask.sum(-1) - 1) // 2 + 1
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
@@ -1080,7 +1074,7 @@ def forward(
          >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
          >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
          >>> list(last_hidden_state.shape)
-         [1, 2, 256]
+         [1, 2, 512]
          ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1222,10 +1216,9 @@ def forward(
         >>> from transformers import WhisperProcessor, WhisperForConditionalGeneration
         >>> from datasets import load_dataset
 
-        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-base")
-
-
+        >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 
         >>> inputs = processor(
@@ -1235,9 +1228,9 @@ def forward(
 
         >>> generated_ids = model.generate(inputs=input_features)
 
-        >>> transcription = processor.batch_decode(generated_ids)[0]
+        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         >>> transcription
-        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
+        ' The quilter is the apostle of the middle classes and we are glad to welcome his'
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 48fc71d6f6b2e..3cc9e719174eb 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -92,4 +92,5 @@ src/transformers/models/wav2vec2/tokenization_wav2vec2.py
 src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
 src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
 src/transformers/models/wavlm/modeling_wavlm.py
+src/transformers/models/whisper/modeling_whisper.py
 src/transformers/models/yolos/modeling_yolos.py

From 12b1ca5106365b1fdfc2307ce3c8711121267fe6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 09:27:59 +0000
Subject: [PATCH 057/156] fix generate

---
 .../whisper/feature_extraction_whisper.py     |  3 +-
 .../models/whisper/modeling_whisper.py        | 46 ++++---------------
 2 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 5e97bdafd04a4..b1486af8f90ef 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -158,7 +158,7 @@ def _extract_fbank_features(
     def __call__(
         self,
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
-        truncation: bool = False,
+        truncation: bool = True,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         sampling_rate: Optional[int] = None,
@@ -251,6 +251,7 @@ def __call__(
         )
         # make sure list is in array format
         input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
+            
         input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
 
         if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0], List):
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 3c3623861ac14..467ee6d62f238 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -469,7 +469,7 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, 0, :]
 
-        subsampled_lengths = (attention_mask.sum(-1) - 1) // 2 + 1
+        subsampled_lengths = ((attention_mask.sum(-1) - 1) // 2) + 1
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
@@ -675,6 +675,7 @@ def forward(
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
+        attention_mask = None
         # subsample attention mask if necessary
         if attention_mask is not None:
             attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
@@ -783,8 +784,8 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
             ).to(inputs_embeds.device)
 
         if attention_mask is not None:
-            if attention_mask.shape[-1] > input_shape[-1] > 1:
-                attention_mask = attention_mask[:, : input_shape[-1]]
+            if attention_mask.shape[-1] > input_shape[-1] > 0:
+                attention_mask = attention_mask[:, : input_shape[-1]+past_key_values_length]
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
             combined_attention_mask = (
@@ -900,7 +901,8 @@ def forward(
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
-
+        attention_mask = None
+        encoder_attention_mask = None
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -1109,7 +1111,8 @@ def forward(
         else:
             encoder_attention_mask = None
 
-        
+        encoder_attention_mask = None
+        decoder_attention_mask = None
 
         # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
@@ -1297,39 +1300,10 @@ def prepare_inputs_for_generation(
             "past_key_values": past,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": None,
+            "encoder_attention_mask":None,
             "use_cache": use_cache,
+            "decoder_attention_mask":None
         }
-
-#    def _prepare_attention_mask_for_generation(
-#        self,
-#        inputs: torch.Tensor,
-#        pad_token_id: Optional[int],
-#        eos_token_id: Optional[int],
-#    ) -> torch.LongTensor:
-#        is_mel_spec = len(inputs.shape) == 3 and inputs.dtype in [torch.float32, torch.float16]
-#        pad_token_id = -0.8060266971588135
-        # Check if input is input_ids and padded -> only then is attention_mask defined
-#        if is_mel_spec:
-#            return inputs.ne(pad_token_id).long()
-#        else:
-#            return None
-
-#    def _prepare_decoder_input_ids_for_generation(
-#        self,
-#        batch_size: int,
-#        decoder_start_token_id: int = None,
-#        bos_token_id: int = None,
-#        model_kwargs: Optional[Dict[str, torch.Tensor]] = None,
-#        device: torch.device = None,
-#    ) -> torch.LongTensor:
-#
-#        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
-#            return model_kwargs.pop("decoder_input_ids")
-#        else:
-#            decoder_start_token_id = list(self.config.decoder_start_token_id)
-#            if device is None:
-#                device = self.device
-#            return torch.tensor(batch_size * [decoder_start_token_id], dtype=torch.long, device=device)
 #
     @staticmethod
     def _reorder_cache(past, beam_idx):

From b4c0cb9182b32a5a5ee2918ac63bb13f06d5fbc8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 09:32:39 +0000
Subject: [PATCH 058/156] remove uncoder attention mask useless

---
 .../models/whisper/modeling_whisper.py          | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 467ee6d62f238..9f840979f62b3 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -17,7 +17,7 @@
 
 import math
 import random
-from typing import Dict, Optional, Tuple
+from typing import Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -785,7 +785,7 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
 
         if attention_mask is not None:
             if attention_mask.shape[-1] > input_shape[-1] > 0:
-                attention_mask = attention_mask[:, : input_shape[-1]+past_key_values_length]
+                attention_mask = attention_mask[:, : input_shape[-1] + past_key_values_length]
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
             expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
             combined_attention_mask = (
@@ -1284,12 +1284,7 @@ def forward(
         )
 
     def prepare_inputs_for_generation(
-        self,
-        decoder_input_ids,
-        past=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs
+        self, decoder_input_ids, past=None, use_cache=None, encoder_outputs=None, **kwargs
     ):
         # cut decoder_input_ids if past is used
         if past is not None:
@@ -1300,11 +1295,11 @@ def prepare_inputs_for_generation(
             "past_key_values": past,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": None,
-            "encoder_attention_mask":None,
             "use_cache": use_cache,
-            "decoder_attention_mask":None
+            "decoder_attention_mask": None,
         }
-#
+
+    #
     @staticmethod
     def _reorder_cache(past, beam_idx):
         reordered_past = ()

From f6b7550b8e7fd183d5b3651766e5fc953612d3f4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 17:05:41 +0000
Subject: [PATCH 059/156] update test modeling whisper

---
 tests/models/whisper/test_modeling_whisper.py | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index a5855a5d02dda..a2457ff50ee10 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -340,7 +340,7 @@ def test_training_gradient_checkpointing(self):
 
     def test_generate_with_head_masking(self):
         pass
-    
+
     def test_generate_fp16(self):
         config, input_dict = self.model_tester.prepare_config_and_inputs()
         config.max_target_positions = 400
@@ -902,6 +902,7 @@ def test_generation_en_only(self):
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         model.to(torch_device)
+        model.config.decoder_start_token_id = 50257
 
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
@@ -909,16 +910,16 @@ def test_generation_en_only(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
-        generated_ids = model.generate(input_features, num_beams=5)
+        generated_ids = model.generate(input_features, num_beams=5, forced_bos_token_id=50362)
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
-            " classes and we are glad"
+            "<|startoftranscript|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            " classes, and we are glad to"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-    @slow
+    # @slow
     def test_generation(self):
 
         torch_device = "cpu"
@@ -933,7 +934,9 @@ def test_generation(self):
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=torch.tensor([[50257, 50258, 50362]]))
+        generated_ids = model.generate(
+            input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50259]])
+        )
         transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (
@@ -944,7 +947,7 @@ def test_generation(self):
 
     @slow
     def test_large_generation(self):
-
+        # TODO last remaining test, it does not work 
         torch_device = "cpu"
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
@@ -959,23 +962,22 @@ def test_large_generation(self):
 
         logits_processor = LogitsProcessorList(
             [
-                SuppressBlank(tokenizer.encode(" "), tokenizer.eos_token_id),
+                SuppressBlank(tokenizer.encode(" "), 50256),
                 SuppressTokens(tokenizer._get_suppress_tokens("-1")),
             ]
         )
 
+        decoder_input_ids = torch.tensor([[50258,50259]]).long()
         generated_ids = model.generate(
             input_features,
             do_sample=False,
             logits_processor=logits_processor,
-            decoder_input_ids=torch.tensor([[50257, 50362]]),
-            attention_mask=None,
-            decoder_attention_mask=None,
+            decoder_input_ids=decoder_input_ids,
         )
         transcript = tokenizer.batch_decode(generated_ids)
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            "<|startoftranscript|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
             " classes and we are glad"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From 8c96dfd9e63ee12872c2ac5654a7d93feaeb009b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 17:05:57 +0000
Subject: [PATCH 060/156] update condfig to add second non supress tokens

---
 .../models/whisper/configuration_whisper.py        | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 319115fb1f5ad..971c8abba5b85 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -26,6 +26,17 @@
 
 # fmt: off
 NON_SPEECH_TOKENS = [
+        1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
+        26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
+        63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
+        705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
+        1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
+        4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
+        11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
+        17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
+        34949, 40283, 40493, 40549, 47282, 49146
+]
+NON_SPEECH_TOKENS_MULTI = [
     1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
     26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
     63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
@@ -38,7 +49,6 @@
 ]
 # fmt: on
 
-
 class WhisperConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate an
@@ -139,7 +149,7 @@ def __init__(
         encoder_ffn_dim=1536,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
-        decoder_start_token_id=[50258, 50259, 50359],
+        decoder_start_token_id=50258,
         use_cache=True,
         is_encoder_decoder=True,
         activation_function="gelu",

From 378841c06bf1407a49ae12734d30b08b530a5c1c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 17:06:16 +0000
Subject: [PATCH 061/156] nits on feature exrtactor

---
 .../whisper/feature_extraction_whisper.py     | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index b1486af8f90ef..94b74e9b514d7 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -30,7 +30,6 @@
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -67,7 +66,6 @@ def __init__(
         self,
         feature_size=80,
         sampling_rate=16000,
-        num_mel_bins=80,
         hop_length=160,
         chunk_length=30,
         n_fft=400,
@@ -75,7 +73,6 @@ def __init__(
         **kwargs
     ):
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
-        self.num_mel_bins = num_mel_bins
         self.n_fft = n_fft
         self.hop_length = hop_length
         self.chunk_length = chunk_length
@@ -83,7 +80,7 @@ def __init__(
         self.n_samples = chunk_length * sampling_rate
         self.nb_max_frame = self.n_samples // hop_length
         self.sampling_rate = sampling_rate
-        self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=num_mel_bins)
+        self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size)
 
     def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         # Initialize the weights
@@ -207,18 +204,6 @@ def __call__(
                 The value that is used to fill the padding values / vectors.
         """
 
-        if sampling_rate is not None:
-            if sampling_rate != self.sampling_rate:
-                raise ValueError(
-                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
-                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
-                    f" {self.sampling_rate} and not {sampling_rate}."
-                )
-        else:
-            logger.warning(
-                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
-                "Failing to do so can result in silent errors that might be hard to debug."
-            )
 
         is_batched = bool(
             isinstance(raw_speech, (list, tuple))
@@ -251,7 +236,7 @@ def __call__(
         )
         # make sure list is in array format
         input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
-            
+
         input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
 
         if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0], List):

From 3a2c411e4cf8361ca99a7eed5871d869eb327ba7 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 17:06:36 +0000
Subject: [PATCH 062/156] nit for test tokenizers

---
 .../models/whisper/english_normalizer.py      | 23 +++++--------------
 .../models/whisper/tokenization_whisper.py    |  6 ++++-
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index 3a6626d028532..d3abd03975c36 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -66,8 +66,7 @@ def __init__(self):
             )
         }
         self.ones_plural = {
-            "sixes" if name == "six" else name + "s": (value, "s")
-            for name, value in self.ones.items()
+            "sixes" if name == "six" else name + "s": (value, "s") for name, value in self.ones.items()
         }
         self.ones_ordinal = {
             "zeroth": (0, "th"),
@@ -94,12 +93,8 @@ def __init__(self):
             "eighty": 80,
             "ninety": 90,
         }
-        self.tens_plural = {
-            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
-        }
-        self.tens_ordinal = {
-            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
-        }
+        self.tens_plural = {name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()}
+        self.tens_ordinal = {name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()}
         self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
 
         self.multipliers = {
@@ -116,12 +111,8 @@ def __init__(self):
             "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
             "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
         }
-        self.multipliers_plural = {
-            name + "s": (value, "s") for name, value in self.multipliers.items()
-        }
-        self.multipliers_ordinal = {
-            name + "th": (value, "th") for name, value in self.multipliers.items()
-        }
+        self.multipliers_plural = {name + "s": (value, "s") for name, value in self.multipliers.items()}
+        self.multipliers_ordinal = {name + "th": (value, "th") for name, value in self.multipliers.items()}
         self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
         self.decimals = {*self.ones, *self.tens, *self.zeros}
 
@@ -141,9 +132,7 @@ def __init__(self):
             "cent": "¢",
             "cents": "¢",
         }
-        self.prefixes = set(
-            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
-        )
+        self.prefixes = set(list(self.preceding_prefixers.values()) + list(self.following_prefixers.values()))
         self.suffixers = {
             "per": {"cent": "%"},
             "percent": "%",
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 829bdaf244868..ed50e76ee6550 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -285,6 +285,11 @@ def __init__(
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+        
     @property
     @lru_cache()
     def sot_sequence(self) -> Tuple[int]:
@@ -367,7 +372,6 @@ def sot_sequence_including_notimestamps(self) -> Tuple[int]:
     def vocab_size(self) -> int:
         return len(self.encoder)
 
-
     def bpe(self, token):
         if token in self.cache:
             return self.cache[token]

From 4dfbba1ce2393449110ad8e99fc60c0b9c57b683 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 27 Sep 2022 17:06:51 +0000
Subject: [PATCH 063/156] update etsts

---
 .../test_feature_extraction_whisper.py        | 195 ++++++++----------
 .../models/whisper/test_processor_whisper.py  |   5 +-
 .../whisper/test_tokenization_whisper.py      |   4 +-
 3 files changed, 91 insertions(+), 113 deletions(-)

diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index 1c16b348adadb..aa5878f43a183 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -22,13 +22,20 @@
 
 from transformers import is_speech_available
 from transformers.testing_utils import require_torch, require_torchaudio
+from transformers.utils.import_utils import is_torch_available
 
 from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
+from transformers.testing_utils import check_json_file_has_correct_format
 
+import tempfile
+import os
 
 if is_speech_available():
     from transformers import WhisperFeatureExtractor
 
+if is_torch_available():
+    import torch
+
 global_rng = random.Random()
 
 
@@ -55,29 +62,33 @@ def __init__(
         batch_size=7,
         min_seq_length=400,
         max_seq_length=2000,
-        feature_size=24,
-        num_mel_bins=24,
+        feature_size=10,
+        hop_length=160,
+        chunk_length=5,
         padding_value=0.0,
-        sampling_rate=16_000,
+        sampling_rate=4_000,
         return_attention_mask=True,
         do_normalize=True,
+
     ):
         self.parent = parent
         self.batch_size = batch_size
         self.min_seq_length = min_seq_length
         self.max_seq_length = max_seq_length
         self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
-        self.feature_size = feature_size
-        self.num_mel_bins = num_mel_bins
         self.padding_value = padding_value
         self.sampling_rate = sampling_rate
         self.return_attention_mask = return_attention_mask
         self.do_normalize = do_normalize
+        self.feature_size = feature_size
+        self.chunk_length = chunk_length
+        self.hop_length = hop_length
 
     def prepare_feat_extract_dict(self):
         return {
-            "feature_size": self.feature_size,
-            "num_mel_bins": self.num_mel_bins,
+            "feature_size":self.feature_size,
+            "hop_length": self.hop_length,
+            "chunk_length":self.chunk_length,
             "padding_value": self.padding_value,
             "sampling_rate": self.sampling_rate,
             "return_attention_mask": self.return_attention_mask,
@@ -110,9 +121,35 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
     def setUp(self):
         self.feat_extract_tester = WhisperFeatureExtractionTester(self)
 
-    def _check_zero_mean_unit_variance(self, input_vector):
-        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
+    def test_feat_extract_from_and_save_pretrained(self):
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
+            check_json_file_has_correct_format(saved_file)
+            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = dict_first.pop("mel_filters")
+        mel_2 = dict_second.pop("mel_filters")
+        self.assertTrue(np.allclose(mel_1,mel_2))
+        self.assertEqual(dict_first, dict_second)
+    
+    def test_feat_extract_to_json_file(self):
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
+            feat_extract_first.to_json_file(json_file_path)
+            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = dict_first.pop("mel_filters")
+        mel_2 = dict_second.pop("mel_filters")
+        self.assertTrue(np.allclose(mel_1,mel_2))
+        self.assertEqual(dict_first, dict_second)
 
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
@@ -122,9 +159,10 @@ def test_call(self):
         np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
 
         # Test feature size
-        input_features = feature_extractor(np_speech_inputs, padding=True, return_tensors="np").input_features
+        input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
         self.assertTrue(input_features.ndim == 3)
-        self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size)
+        self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frame)
+        self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
 
         # Test not batched input
         encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
@@ -136,105 +174,18 @@ def test_call(self):
         encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
         for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
             self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+        
+        # Test truncation required
+        speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples+500), 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
 
-    def test_cepstral_mean_and_variance_normalization(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-
-        paddings = ["longest", "max_length", "do_not_pad"]
-        max_lengths = [None, 16, None]
-        for max_length, padding in zip(max_lengths, paddings):
-            inputs = feature_extractor(
-                speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True
-            )
-            input_features = inputs.input_features
-            attention_mask = inputs.attention_mask
-            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-
-            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
-            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
-            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
-
-    def test_cepstral_mean_and_variance_normalization_np(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-
-        paddings = ["longest", "max_length", "do_not_pad"]
-        max_lengths = [None, 16, None]
-        for max_length, padding in zip(max_lengths, paddings):
-            inputs = feature_extractor(
-                speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True
-            )
-            input_features = inputs.input_features
-            attention_mask = inputs.attention_mask
-            fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-
-            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
-            self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
-            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
-            self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6)
-            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
-
-    def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="max_length",
-            max_length=4,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1])
-        self._check_zero_mean_unit_variance(input_features[2])
-
-    def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
-        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="longest",
-            max_length=4,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        self._check_zero_mean_unit_variance(input_features[2])
-
-        # make sure that if max_length < longest -> then pad to max_length
-        self.assertEqual(input_features.shape, (3, 4, 24))
-
-        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        inputs = feature_extractor(
-            speech_inputs,
-            padding="longest",
-            max_length=16,
-            truncation=True,
-            return_tensors="np",
-            return_attention_mask=True,
-        )
-        input_features = inputs.input_features
-        attention_mask = inputs.attention_mask
-        fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-
-        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        self._check_zero_mean_unit_variance(input_features[2])
+        speech_inputs_truncated = [ x[:feature_extractor.n_samples] for x in speech_inputs]
+        np_speech_inputs_truncated  = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
 
-        # make sure that if max_length < longest -> then pad to max_length
-        self.assertEqual(input_features.shape, (3, 6, 24))
+        encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
 
     def test_double_precision_pad(self):
         import torch
@@ -248,3 +199,29 @@ def test_double_precision_pad(self):
             self.assertTrue(np_processed.input_features.dtype == np.float32)
             pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
             self.assertTrue(pt_processed.input_features.dtype == torch.float32)
+    
+    def _load_datasamples(self, num_samples):
+        from datasets import load_dataset
+
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples]
+
+    def test_integration(self): 
+        # fmt: off
+        EXPECTED_INPUT_FEATURES = torch.tensor(
+            [   
+                0.1193, -0.0946, -0.1098, -0.0196,  0.0225, -0.0690, -0.1736,  0.0951,
+                0.0971, -0.0817, -0.0702,  0.0162,  0.0260,  0.0017, -0.0192, -0.1678,
+                0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
+                -0.0274, -0.1425, -0.1423,  0.0837,  0.0377, -0.0854
+            ]
+        )
+        # fmt: on
+
+        input_speech = self._load_datasamples(1)
+        feaure_extractor = WhisperFeatureExtractor()
+        input_features = feaure_extractor(input_speech,return_tensors="pt").input_features
+        self.assertTrue(torch.allclose(input_features[0,0,:30],EXPECTED_INPUT_FEATURES, atol = 1e-4))
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index e969c0f331474..78ca77a71e9f8 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -19,7 +19,7 @@
 from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
 
 from .test_feature_extraction_whisper import floats_list
-
+import tempfile
 
 if is_speech_available():
     from transformers import WhisperFeatureExtractor, WhisperProcessor
@@ -30,7 +30,8 @@
 @require_sentencepiece
 class WhisperProcessorTest(unittest.TestCase):
     def setUp(self):
-        self.checkpoint = "ArthurZ/whisper-small.eng"
+        self.checkpoint = "openai/whisper-small.en"
+        self.tmpdirname = tempfile.mkdtemp()
 
     def get_tokenizer(self, **kwargs):
         return WhisperTokenizer.from_pretrained(self.checkpoint, **kwargs)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index c5a8f58495404..db5fc9a10ac60 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -115,7 +115,7 @@ def setUpClass(cls):
 
     def test_tokenizer_equivalence(self):
         text = "다람쥐 헌 쳇바퀴에 타고파"
-        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny",language="ko")
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="ko")
         gpt2_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
 
         gpt2_tokens = gpt2_tokenizer.encode(text)
@@ -159,7 +159,7 @@ def test_tokenizer_special(self):
 
         self.assertEqual(text, multilingual_tokenizer.decode(multilingual_tokens))
 
-        transcript =  multilingual_tokenizer.decode(multilingual_tokens,skip_special_tokens=True)
+        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens = True)
 
         EXPECTED_JAP = "  Hey! How are you feeling? J'ai l'impression que 郷さん est prêt "
         self.assertListEqual(transcript, EXPECTED_JAP)

From 0a39f4918e97610864293d7661c3636130a4b355 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 09:55:19 +0000
Subject: [PATCH 064/156] update tests

---
 .../whisper/feature_extraction_whisper.py     |   1 +
 .../test_feature_extraction_whisper.py        |   7 +-
 tests/models/whisper/test_modeling_whisper.py | 124 ++++++++----------
 .../models/whisper/test_processor_whisper.py  |   3 +-
 .../whisper/test_tokenization_whisper.py      |  53 ++++----
 5 files changed, 84 insertions(+), 104 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 94b74e9b514d7..918fca931a102 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -30,6 +30,7 @@
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
 
+
 logger = logging.get_logger(__name__)
 
 
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index aa5878f43a183..300b124d27fcb 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -15,20 +15,19 @@
 
 
 import itertools
+import os
 import random
+import tempfile
 import unittest
 
 import numpy as np
 
 from transformers import is_speech_available
-from transformers.testing_utils import require_torch, require_torchaudio
+from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torchaudio
 from transformers.utils.import_utils import is_torch_available
 
 from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
-from transformers.testing_utils import check_json_file_has_correct_format
 
-import tempfile
-import os
 
 if is_speech_available():
     from transformers import WhisperFeatureExtractor
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index a2457ff50ee10..14c6a10f245c6 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -761,14 +761,14 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
-    def test_inference_no_head(self):
+    def test_inference_tiny(self):
         torch_device = "cpu"
         set_seed(0)
         model = WhisperModel.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
         input_speech = self._load_datasamples(1)
         feature_extractor = WhisperFeatureExtractor()
-        input_features = feature_extractor(2 * input_speech, return_tensors="pt").input_features
+        input_features = feature_extractor(input_speech, return_tensors="pt").input_features
 
         with torch.no_grad():
             logits = model(
@@ -783,10 +783,10 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9892, -6.7607, 5.7348, 3.6095, 0.2152, -5.7321, 4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
-                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3714,
-                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
+                2.9892, -6.7607,  5.7348,  3.6095,  0.2152, -5.7321,  4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
+                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3714,
+                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
             ]
         )
         # fmt: on
@@ -795,10 +795,10 @@ def test_inference_no_head(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3204, 1.9836,
-                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
-                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
-                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
+                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3204,  1.9836,
+                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
+                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
+                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
             ]
         )
         # fmt: on
@@ -807,42 +807,34 @@ def test_inference_no_head(self):
         self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
 
     def test_small_logits_librispeech(self):
-
-        torch_device = "cpu"
         set_seed(0)
+        torch_device = "cpu"
         model = WhisperModel.from_pretrained("openai/whisper-small.en")
         model.to(torch_device)
 
-        # processor = self.default_processor
-
         input_speech = self._load_datasamples(1)
 
         feaure_extractor = WhisperFeatureExtractor()
-        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
-        tokenizer.pad_token = 0
-
-        processor = WhisperProcessor(feaure_extractor, tokenizer)
+        input_features = feaure_extractor(input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        input_features = processor(audio=input_speech, return_tensors="pt").input_features.to(torch_device)
-
-        with torch.no_grad():
-            logits = model(
-                input_features,
-                decoder_input_ids=torch.tensor([model.config.decoder_start_token_id]),
-                output_hidden_states=False,
-                output_attentions=False,
-                use_cache=False,
-            )
+        logits = model(
+            input_features,
+            decoder_input_ids=torch.tensor([[model.config.decoder_start_token_id]]),
+            output_hidden_states=False,
+            output_attentions=False,
+            use_cache=False,
+        )
 
         logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
 
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -3.5023, -5.8727, -7.1252, -8.5208, -6.0207, -7.8296, -4.6376, -6.3990,
-                -3.7516, -4.0411, -6.8055, -3.7937, -6.8897, -3.4925, -5.4489, -5.2272,
-                -5.7970, -6.8300, -6.4165, -6.9162, -7.2233, -8.0165, -6.9419, -8.7574,
-                -6.9695, -5.8984, -4.6315, -8.2338, -8.9415, -5.8150
+                -3.6784,  -7.7212,  -9.5070, -11.9286,  -7.6489,  -9.7026,  -5.6188,
+                -8.0104,  -4.6239,  -5.1833,  -9.0485,  -3.4079,  -5.4874,  -2.6935,
+                -6.3479,  -7.3398,  -6.9558,  -7.6867,  -7.4748,  -8.3463,  -9.9781,
+                -10.8389, -10.3105, -11.7201,  -9.7261,  -7.1590,  -5.9272, -12.4509,
+                -11.1147,  -8.1918
             ]
         )
         # fmt: on
@@ -850,46 +842,37 @@ def test_small_logits_librispeech(self):
 
     @slow
     def test_large_logits_librispeech(self):
+        set_seed(0)
 
         torch_device = "cpu"
         model = WhisperModel.from_pretrained("openai/whisper-large")
-        set_seed(0)
         model.to(torch_device)
 
-        # processor = self.default_processor
-
         input_speech = self._load_datasamples(1)
 
-        feaure_extractor = WhisperFeatureExtractor()
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
-        tokenizer.pad_token = 0
-
-        processor = WhisperProcessor(feaure_extractor, tokenizer)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
+        processed_inputs = processor(audio=input_speech, text="This part of the speech", return_tensors="pt")
+        input_features = processed_inputs.input_features.to(torch_device)
+        labels = processed_inputs.labels.to(torch_device)
 
-        input_features = processor(
-            audio=input_speech, text="This part of the speech", return_tensors="pt"
-        ).input_features.to(torch_device)
-        labels = processor(audio=input_speech, text="This part of the speech", return_tensors="pt").labels.to(
-            torch_device
+       
+        logits = model(
+            input_features,
+            decoder_input_ids=labels,
+            output_hidden_states=False,
+            output_attentions=False,
+            use_cache=False,
         )
-        with torch.no_grad():
-            logits = model(
-                input_features,
-                decoder_input_ids=labels,
-                output_hidden_states=False,
-                output_attentions=False,
-                use_cache=False,
-            )
 
         logits = logits.last_hidden_state @ model.decoder.embed_tokens.weight.T
 
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1417, 0.9379, 4.4650, 3.5576, 2.4032, 3.8589, -0.6490, 2.5477,
-                1.8330, 1.9925, 2.3441, 1.4747, 0.5453, 2.2641, 1.5200, 2.5393,
-                1.1657, 0.6221, 1.0749, 1.8284, 2.4085, 1.6626, 2.3525, 1.3372,
-                1.9910, 1.8686, 3.8962, 5.3653, 4.4751, 3.9166
+                2.1382,  0.9381,  4.4671,  3.5589,  2.4022,  3.8577, -0.6521,  2.5472,
+                1.8301,  1.9957,  2.3432,  1.4678,  0.5459,  2.2597,  1.5179,  2.5357,
+                1.1624,  0.6194,  1.0757,  1.8259,  2.4076,  1.6601,  2.3503,  1.3376,
+                1.9891,  1.8635,  3.8931,  5.3699,  4.4772,  3.9184
             ]
         )
         # fmt: on
@@ -919,25 +902,26 @@ def test_generation_en_only(self):
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-    # @slow
+    @slow
     def test_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
-        model.config.decoder_input_ids = [50258, 50259, 50359, 50363]
 
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
+
+        decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(
-            input_features, num_beams=5, decoder_input_ids=torch.tensor([[50258, 50259]])
+            input_features, num_beams=5, decoder_input_ids=decoder_input_ids
         )
-        transcript = tokenizer.batch_decode(generated_ids)[0]
+        transcript = tokenizer.decode(generated_ids[0])
 
         EXPECTED_TRANSCRIPT = (
             "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
@@ -947,7 +931,6 @@ def test_generation(self):
 
     @slow
     def test_large_generation(self):
-        # TODO last remaining test, it does not work 
         torch_device = "cpu"
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
@@ -958,7 +941,7 @@ def test_large_generation(self):
 
         input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
 
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small.en")
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
 
         logits_processor = LogitsProcessorList(
             [
@@ -966,18 +949,19 @@ def test_large_generation(self):
                 SuppressTokens(tokenizer._get_suppress_tokens("-1")),
             ]
         )
+        tokenizer.eos_token_id = 50257
+        tokenizer.eos_token = "<|endoftext|>"
+        model.config.eos_token_id = 50257
+        model.config.decoder_start_token_id = 50258
 
-        decoder_input_ids = torch.tensor([[50258,50259]]).long()
+        decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(
             input_features,
             do_sample=False,
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids)
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
-            " classes and we are glad"
-        )
+        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we're glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
diff --git a/tests/models/whisper/test_processor_whisper.py b/tests/models/whisper/test_processor_whisper.py
index 78ca77a71e9f8..00a5995f003da 100644
--- a/tests/models/whisper/test_processor_whisper.py
+++ b/tests/models/whisper/test_processor_whisper.py
@@ -13,13 +13,14 @@
 # limitations under the License.
 
 import shutil
+import tempfile
 import unittest
 
 from transformers import WhisperTokenizer, is_speech_available
 from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
 
 from .test_feature_extraction_whisper import floats_list
-import tempfile
+
 
 if is_speech_available():
     from transformers import WhisperFeatureExtractor, WhisperProcessor
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index db5fc9a10ac60..18db3217de5ba 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -30,7 +30,7 @@
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = WhisperTokenizer
     test_rust_tokenizer = False
-    test_sentencepiece = True
+    test_sentencepiece = False
 
     def setUp(self):
         super().setUp()
@@ -39,8 +39,8 @@ def setUp(self):
 
     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = "<pad>"
-        token_id = 1
+        token = "Where"
+        token_id = 14436
 
         self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
         self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
@@ -48,53 +48,53 @@ def test_convert_token_and_id(self):
     def test_get_vocab(self):
         vocab_keys = list(self.get_tokenizer().get_vocab().keys())
 
-        self.assertEqual(vocab_keys[0], "<s>")
-        self.assertEqual(vocab_keys[1], "<pad>")
-        self.assertEqual(vocab_keys[-1], "j")
-        self.assertEqual(len(vocab_keys), 1_001)
+        self.assertEqual(vocab_keys[0], "!")
+        self.assertEqual(vocab_keys[1], "\"")
+        self.assertEqual(vocab_keys[-1], "<|notimestamps|>")
+        self.assertEqual(len(vocab_keys), 50364)
 
     def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 1_001)
+        self.assertEqual(self.get_tokenizer().vocab_size, 50257)
 
     def test_full_tokenizer(self):
         tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
 
         tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+        self.assertListEqual(tokens, ['This', 'Ġis', 'Ġa', 'Ġ', 'test'])
 
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens),
-            [289, 50, 14, 174, 386],
+            [5723, 307, 257, 220, 31636],
         )
 
         tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
         self.assertListEqual(
             tokens,
             # fmt: off
-            [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", "."],
+            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'Ã©', '.' ],
             # fmt: on
         )
         ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [12, 25, 88, 59, 28, 23, 11, 4, 606, 351, 351, 351, 7, 16, 70, 50, 76, 84, 10, 4, 8])
+        self.assertListEqual(ids, [40, 373, 4642, 287, 10190, 830, 11, 290, 428, 318, 27807, 2634, 13])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
         self.assertListEqual(
             back_tokens,
             # fmt: off
-            [SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "<unk>", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "<unk>", "."],
+            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'Ã©', '.' ],
             # fmt: on
         )
 
     @slow
     def test_tokenizer_integration(self):
         # fmt: off
-        expected_encoding = {'input_ids': [[3791, 797, 31, 11, 64, 797, 31, 2429, 433, 12, 1176, 12, 20, 786, 915, 142, 2413, 240, 37, 3238, 797, 31, 11, 35, 93, 915, 142, 2413, 240, 37, 5540, 567, 1276, 93, 37, 610, 40, 62, 455, 657, 1042, 123, 780, 177, 37, 309, 241, 1298, 514, 20, 292, 2737, 114, 2469, 241, 85, 64, 302, 548, 528, 423, 4, 509, 406, 423, 37, 601, 4, 777, 302, 548, 528, 423, 284, 4, 3388, 511, 459, 4, 3555, 40, 321, 302, 705, 4, 3388, 511, 583, 326, 5, 5, 5, 62, 3310, 560, 177, 2680, 217, 1508, 32, 31, 853, 418, 64, 583, 511, 1605, 62, 35, 93, 560, 177, 2680, 217, 1508, 1521, 64, 583, 511, 519, 62, 20, 1515, 764, 20, 149, 261, 5625, 7972, 20, 5540, 567, 1276, 93, 3925, 1675, 11, 15, 802, 7972, 576, 217, 1508, 11, 35, 93, 1253, 2441, 15, 289, 652, 31, 416, 321, 3842, 115, 40, 911, 8, 476, 619, 4, 380, 142, 423, 335, 240, 35, 93, 264, 8, 11, 335, 569, 420, 163, 5, 2], [260, 548, 528, 423, 20, 451, 20, 2681, 1153, 3434, 20, 5540, 37, 567, 126, 1253, 2441, 3376, 449, 210, 431, 1563, 177, 767, 5540, 11, 1203, 472, 11, 2953, 685, 285, 364, 706, 1153, 20, 6799, 20, 2869, 20, 4464, 126, 40, 2429, 20, 1040, 866, 2664, 418, 20, 318, 20, 1726, 186, 20, 265, 522, 35, 93, 2191, 4634, 20, 1040, 12, 6799, 15, 228, 2356, 142, 31, 11, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [2575, 2666, 684, 1582, 1176, 12, 627, 149, 619, 20, 4902, 563, 11, 20, 149, 261, 3420, 2356, 174, 142, 4714, 131, 5, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # noqa: E501
+        expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
         # fmt: on
 
         self.tokenizer_integration_test_util(
             expected_encoding=expected_encoding,
-            model_name="opneai/whispoer-tiny",
-            revision="a14f04cf0776c02f62a8cb800cf7909e15ea23ad",
+            model_name="openai/whisper-tiny.en",
+            padding=False
         )
 
 
@@ -107,7 +107,7 @@ class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
         " than his matter.<|endoftext|>'"
     )
     clean_transcript = "  Nor is Mr. Quilters manner less interesting than his matter."
-
+    french_text = "Bonjour! Il me semble que Mrs Quilters n'était pas présente"
     @classmethod
     def setUpClass(cls):
         cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
@@ -143,15 +143,15 @@ def test_tokenizer_equivalence(self):
 
     def test_tokenizer_special(self):
         multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
-        text = "<|startoftranscript|> Hey! How are you feeling? J'ai l'impression que 郷さん est prêt <|endoftext|>"
+        text = "<|startoftranscript|> Hey! How are you feeling? J'ai l'impression que 郷さん est prêt<|endoftext|>"
 
         multilingual_tokens = multilingual_tokenizer.encode(text)
 
         # fmt: off
         EXPECTED_MULTI = [
-            50258, 1911, 0, 1012, 366, 291, 2633, 30, 508, 6,
-            1301, 287, 6, 36107, 631, 220, 11178, 115, 15567, 871,
-            44393, 220, 50257
+            50257, 10814,     0,  1374,   389,   345,  4203,    30,   449,     6,
+            1872,   300,     6, 11011,  2234,  8358, 16268,   225,   115, 43357,
+            22174,  1556,   778, 25792,    83, 50256
         ]
         # fmt: on
 
@@ -161,7 +161,7 @@ def test_tokenizer_special(self):
 
         transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens = True)
 
-        EXPECTED_JAP = "  Hey! How are you feeling? J'ai l'impression que 郷さん est prêt "
+        EXPECTED_JAP = ["Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"]
         self.assertListEqual(transcript, EXPECTED_JAP)
 
     def test_vocab_size(self):
@@ -179,11 +179,6 @@ def test_tokenizer_adds_special_tokens(self):
         self.tokenizer.language_token = "fr"
         encoded = self.tokenizer(self.french_text).input_ids
         self.assertEqual(encoded[0], EN_CODE)
-        self.assertEqual(encoded[-1], self.tokenizer.eos_token_id)
+        # 20682 != 50258
 
-    def test_language_token_setter(self):
-        self.tokenizer.language_token = "en"
-        self.assertListEqual(self.tokenizer.prefix_tokens, [EN_CODE])
-
-        self.tokenizer.language_token = "es"
-        self.assertListEqual(self.tokenizer.prefix_tokens, [ES_CODE])
+        self.assertEqual(encoded[-1], self.tokenizer.eos_token_id)

From 1fd1d52ab2974325a6f902ea56f95b7d3ed5e07f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 10:15:26 +0000
Subject: [PATCH 065/156] update tokenization test

---
 .../whisper/test_tokenization_whisper.py      | 25 ++++++++-----------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 18db3217de5ba..beebdb310080d 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -35,6 +35,8 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
+        tokenizer.pad_token_id=50256
+        tokenizer.pad_token="<|endoftext|>"
         tokenizer.save_pretrained(self.tmpdirname)
 
     def test_convert_token_and_id(self):
@@ -71,20 +73,23 @@ def test_full_tokenizer(self):
         self.assertListEqual(
             tokens,
             # fmt: off
-            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'Ã©', '.' ],
+            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.' ],
             # fmt: on
         )
         ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [40, 373, 4642, 287, 10190, 830, 11, 290, 428, 318, 27807, 2634, 13])
+        self.assertListEqual(ids, [40, 390, 4232, 294, 1722, 25743, 11, 293, 220, 11176, 307, 16720, 526, 13])
 
         back_tokens = tokenizer.convert_ids_to_tokens(ids)
         self.assertListEqual(
             back_tokens,
             # fmt: off
-            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'Ã©', '.' ],
+            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.' ],
             # fmt: on
         )
 
+    def test_tokenizer_slow_store_full_signature(self):
+        pass
+
     @slow
     def test_tokenizer_integration(self):
         # fmt: off
@@ -161,8 +166,8 @@ def test_tokenizer_special(self):
 
         transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens = True)
 
-        EXPECTED_JAP = ["Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"]
-        self.assertListEqual(transcript, EXPECTED_JAP)
+        EXPECTED_JAP = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
+        self.assertEqual(transcript, EXPECTED_JAP)
 
     def test_vocab_size(self):
         self.assertEqual(self.tokenizer.vocab_size, 50257)
@@ -173,12 +178,4 @@ def test_tokenizer_decode_ignores_language_codes(self):
         result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
         expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
         self.assertEqual(result, expected_spanish)
-        self.assertNotIn(self.tokenizer.eos_token, result)
-
-    def test_tokenizer_adds_special_tokens(self):
-        self.tokenizer.language_token = "fr"
-        encoded = self.tokenizer(self.french_text).input_ids
-        self.assertEqual(encoded[0], EN_CODE)
-        # 20682 != 50258
-
-        self.assertEqual(encoded[-1], self.tokenizer.eos_token_id)
+        self.assertNotIn(self.tokenizer.eos_token, result)
\ No newline at end of file

From 2a900f484fbf8a51fc6365a0bc1613d13ac1bcf5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 10:50:50 +0000
Subject: [PATCH 066/156] fixup

---
 .../models/whisper/configuration_whisper.py   | 19 ++++----
 .../whisper/feature_extraction_whisper.py     |  1 -
 .../models/whisper/tokenization_whisper.py    |  2 +-
 .../test_feature_extraction_whisper.py        | 35 +++++++-------
 tests/models/whisper/test_modeling_whisper.py | 47 +++++++++----------
 .../whisper/test_tokenization_whisper.py      | 32 ++++++-------
 6 files changed, 65 insertions(+), 71 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 971c8abba5b85..ad90e3a1bff5d 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -26,15 +26,15 @@
 
 # fmt: off
 NON_SPEECH_TOKENS = [
-        1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
-        26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
-        63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
-        705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
-        1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
-        4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
-        11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
-        17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
-        34949, 40283, 40493, 40549, 47282, 49146
+    1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
+    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
+    63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
+    705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
+    1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
+    4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
+    11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
+    17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
+    34949, 40283, 40493, 40549, 47282, 49146
 ]
 NON_SPEECH_TOKENS_MULTI = [
     1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
@@ -49,6 +49,7 @@
 ]
 # fmt: on
 
+
 class WhisperConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate an
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 918fca931a102..586be7a2c1955 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -205,7 +205,6 @@ def __call__(
                 The value that is used to fill the padding values / vectors.
         """
 
-
         is_batched = bool(
             isinstance(raw_speech, (list, tuple))
             and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index ed50e76ee6550..d8b2bfadb4948 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -289,7 +289,7 @@ def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
-        
+
     @property
     @lru_cache()
     def sot_sequence(self) -> Tuple[int]:
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index 300b124d27fcb..67ba729cae8c5 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -68,7 +68,6 @@ def __init__(
         sampling_rate=4_000,
         return_attention_mask=True,
         do_normalize=True,
-
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -85,9 +84,9 @@ def __init__(
 
     def prepare_feat_extract_dict(self):
         return {
-            "feature_size":self.feature_size,
+            "feature_size": self.feature_size,
             "hop_length": self.hop_length,
-            "chunk_length":self.chunk_length,
+            "chunk_length": self.chunk_length,
             "padding_value": self.padding_value,
             "sampling_rate": self.sampling_rate,
             "return_attention_mask": self.return_attention_mask,
@@ -132,9 +131,9 @@ def test_feat_extract_from_and_save_pretrained(self):
         dict_second = feat_extract_second.to_dict()
         mel_1 = dict_first.pop("mel_filters")
         mel_2 = dict_second.pop("mel_filters")
-        self.assertTrue(np.allclose(mel_1,mel_2))
+        self.assertTrue(np.allclose(mel_1, mel_2))
         self.assertEqual(dict_first, dict_second)
-    
+
     def test_feat_extract_to_json_file(self):
         feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
 
@@ -147,7 +146,7 @@ def test_feat_extract_to_json_file(self):
         dict_second = feat_extract_second.to_dict()
         mel_1 = dict_first.pop("mel_filters")
         mel_2 = dict_second.pop("mel_filters")
-        self.assertTrue(np.allclose(mel_1,mel_2))
+        self.assertTrue(np.allclose(mel_1, mel_2))
         self.assertEqual(dict_first, dict_second)
 
     def test_call(self):
@@ -173,13 +172,13 @@ def test_call(self):
         encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
         for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
             self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
-        
+
         # Test truncation required
-        speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples+500), 200)]
+        speech_inputs = [floats_list((1, x))[0] for x in range(200, (feature_extractor.n_samples + 500), 200)]
         np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
 
-        speech_inputs_truncated = [ x[:feature_extractor.n_samples] for x in speech_inputs]
-        np_speech_inputs_truncated  = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
+        speech_inputs_truncated = [x[: feature_extractor.n_samples] for x in speech_inputs]
+        np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
 
         encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
         encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
@@ -198,7 +197,7 @@ def test_double_precision_pad(self):
             self.assertTrue(np_processed.input_features.dtype == np.float32)
             pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
             self.assertTrue(pt_processed.input_features.dtype == torch.float32)
-    
+
     def _load_datasamples(self, num_samples):
         from datasets import load_dataset
 
@@ -208,19 +207,19 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
-    def test_integration(self): 
+    def test_integration(self):
         # fmt: off
         EXPECTED_INPUT_FEATURES = torch.tensor(
-            [   
-                0.1193, -0.0946, -0.1098, -0.0196,  0.0225, -0.0690, -0.1736,  0.0951,
-                0.0971, -0.0817, -0.0702,  0.0162,  0.0260,  0.0017, -0.0192, -0.1678,
+            [
+                0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951,
+                0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678,
                 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554,
-                -0.0274, -0.1425, -0.1423,  0.0837,  0.0377, -0.0854
+                -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854
             ]
         )
         # fmt: on
 
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
-        input_features = feaure_extractor(input_speech,return_tensors="pt").input_features
-        self.assertTrue(torch.allclose(input_features[0,0,:30],EXPECTED_INPUT_FEATURES, atol = 1e-4))
+        input_features = feaure_extractor(input_speech, return_tensors="pt").input_features
+        self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 14c6a10f245c6..2daed9cf4c2d0 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -761,7 +761,7 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
-    def test_inference_tiny(self):
+    def test_tiny_logits_librispeech(self):
         torch_device = "cpu"
         set_seed(0)
         model = WhisperModel.from_pretrained("openai/whisper-tiny")
@@ -783,10 +783,10 @@ def test_inference_tiny(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9892, -6.7607,  5.7348,  3.6095,  0.2152, -5.7321,  4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
-                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3714,
-                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
+                2.9892, -6.7607, 5.7348, 3.6095, 0.2152, -5.7321, 4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
+                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3714,
+                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
             ]
         )
         # fmt: on
@@ -795,10 +795,10 @@ def test_inference_tiny(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3204,  1.9836,
-                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
-                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
-                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
+                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3204, 1.9836,
+                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
+                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
+                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
             ]
         )
         # fmt: on
@@ -806,7 +806,7 @@ def test_inference_tiny(self):
         head_logits = logits[0] @ model.decoder.embed_tokens.weight.T
         self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
 
-    def test_small_logits_librispeech(self):
+    def test_small_en_logits_librispeech(self):
         set_seed(0)
         torch_device = "cpu"
         model = WhisperModel.from_pretrained("openai/whisper-small.en")
@@ -830,11 +830,11 @@ def test_small_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -3.6784,  -7.7212,  -9.5070, -11.9286,  -7.6489,  -9.7026,  -5.6188,
-                -8.0104,  -4.6239,  -5.1833,  -9.0485,  -3.4079,  -5.4874,  -2.6935,
-                -6.3479,  -7.3398,  -6.9558,  -7.6867,  -7.4748,  -8.3463,  -9.9781,
-                -10.8389, -10.3105, -11.7201,  -9.7261,  -7.1590,  -5.9272, -12.4509,
-                -11.1147,  -8.1918
+                -3.6784, -7.7212, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
+                -8.0104, -4.6239, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
+                -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
+                -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
+                -11.1147, -8.1918
             ]
         )
         # fmt: on
@@ -855,7 +855,6 @@ def test_large_logits_librispeech(self):
         input_features = processed_inputs.input_features.to(torch_device)
         labels = processed_inputs.labels.to(torch_device)
 
-       
         logits = model(
             input_features,
             decoder_input_ids=labels,
@@ -869,17 +868,17 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1382,  0.9381,  4.4671,  3.5589,  2.4022,  3.8577, -0.6521,  2.5472,
-                1.8301,  1.9957,  2.3432,  1.4678,  0.5459,  2.2597,  1.5179,  2.5357,
-                1.1624,  0.6194,  1.0757,  1.8259,  2.4076,  1.6601,  2.3503,  1.3376,
-                1.9891,  1.8635,  3.8931,  5.3699,  4.4772,  3.9184
+                2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8577, -0.6521, 2.5472,
+                1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
+                1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
+                1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
             ]
         )
         # fmt: on
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
-    def test_generation_en_only(self):
+    def test_tiny_en_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
@@ -903,7 +902,7 @@ def test_generation_en_only(self):
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
     @slow
-    def test_generation(self):
+    def test_tiny_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
@@ -918,9 +917,7 @@ def test_generation(self):
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
 
         decoder_input_ids = torch.tensor([[50258]]).long()
-        generated_ids = model.generate(
-            input_features, num_beams=5, decoder_input_ids=decoder_input_ids
-        )
+        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=decoder_input_ids)
         transcript = tokenizer.decode(generated_ids[0])
 
         EXPECTED_TRANSCRIPT = (
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index beebdb310080d..5999c6d225640 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -14,7 +14,6 @@
 
 import unittest
 
-from transformers import SPIECE_UNDERLINE
 from transformers.models.whisper import WhisperTokenizer
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
 
@@ -35,8 +34,8 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
-        tokenizer.pad_token_id=50256
-        tokenizer.pad_token="<|endoftext|>"
+        tokenizer.pad_token_id = 50256
+        tokenizer.pad_token = "<|endoftext|>"
         tokenizer.save_pretrained(self.tmpdirname)
 
     def test_convert_token_and_id(self):
@@ -51,7 +50,7 @@ def test_get_vocab(self):
         vocab_keys = list(self.get_tokenizer().get_vocab().keys())
 
         self.assertEqual(vocab_keys[0], "!")
-        self.assertEqual(vocab_keys[1], "\"")
+        self.assertEqual(vocab_keys[1], '"')
         self.assertEqual(vocab_keys[-1], "<|notimestamps|>")
         self.assertEqual(len(vocab_keys), 50364)
 
@@ -62,7 +61,7 @@ def test_full_tokenizer(self):
         tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
 
         tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ['This', 'Ġis', 'Ġa', 'Ġ', 'test'])
+        self.assertListEqual(tokens, ["This", "Ġis", "Ġa", "Ġ", "test"])
 
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens),
@@ -73,7 +72,7 @@ def test_full_tokenizer(self):
         self.assertListEqual(
             tokens,
             # fmt: off
-            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.' ],
+            ['I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.'],
             # fmt: on
         )
         ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -83,7 +82,7 @@ def test_full_tokenizer(self):
         self.assertListEqual(
             back_tokens,
             # fmt: off
-            ['I', 'Ġwas','Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.' ],
+            ['I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġ', 'this', 'Ġis', 'Ġfals', 'Ã©', '.'],
             # fmt: on
         )
 
@@ -93,13 +92,11 @@ def test_tokenizer_slow_store_full_signature(self):
     @slow
     def test_tokenizer_integration(self):
         # fmt: off
-        expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
+        expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # noqa: E501
         # fmt: on
 
         self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="openai/whisper-tiny.en",
-            padding=False
+            expected_encoding=expected_encoding, model_name="openai/whisper-tiny.en", padding=False
         )
 
 
@@ -113,6 +110,7 @@ class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
     )
     clean_transcript = "  Nor is Mr. Quilters manner less interesting than his matter."
     french_text = "Bonjour! Il me semble que Mrs Quilters n'était pas présente"
+
     @classmethod
     def setUpClass(cls):
         cls.tokenizer: WhisperTokenizer = WhisperTokenizer.from_pretrained(cls.checkpoint_name)
@@ -131,7 +129,7 @@ def test_tokenizer_equivalence(self):
         assert len(gpt2_tokens) > len(multilingual_tokens)
 
         # fmt: off
-        EXPECTED_ENG= [
+        EXPECTED_ENG = [
             46695, 97, 167, 252, 234, 168, 98, 238, 220, 169,
             245, 234, 23821, 111, 229, 167, 108, 242, 169, 222,
             112, 168, 245, 238, 220, 169, 225, 222, 166, 111,
@@ -154,9 +152,9 @@ def test_tokenizer_special(self):
 
         # fmt: off
         EXPECTED_MULTI = [
-            50257, 10814,     0,  1374,   389,   345,  4203,    30,   449,     6,
-            1872,   300,     6, 11011,  2234,  8358, 16268,   225,   115, 43357,
-            22174,  1556,   778, 25792,    83, 50256
+            50257, 10814, 0, 1374, 389, 345, 4203, 30, 449, 6,
+            1872, 300, 6, 11011, 2234, 8358, 16268, 225, 115, 43357,
+            22174, 1556, 778, 25792, 83, 50256
         ]
         # fmt: on
 
@@ -164,7 +162,7 @@ def test_tokenizer_special(self):
 
         self.assertEqual(text, multilingual_tokenizer.decode(multilingual_tokens))
 
-        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens = True)
+        transcript = multilingual_tokenizer.decode(multilingual_tokens, skip_special_tokens=True)
 
         EXPECTED_JAP = "Hey! How are you feeling? J'ai l'impression que 郷さん est prêt"
         self.assertEqual(transcript, EXPECTED_JAP)
@@ -178,4 +176,4 @@ def test_tokenizer_decode_ignores_language_codes(self):
         result = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
         expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
         self.assertEqual(result, expected_spanish)
-        self.assertNotIn(self.tokenizer.eos_token, result)
\ No newline at end of file
+        self.assertNotIn(self.tokenizer.eos_token, result)

From d16d3e130c4bb38b075f2e8ba4815888069ca624 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 11:07:26 +0000
Subject: [PATCH 067/156] invalidated hf token. Clean convert openai to whisper

---
 .../whisper/convert_openai_whisper_to_tfms.py | 66 ++-----------------
 1 file changed, 6 insertions(+), 60 deletions(-)

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 8e9d0b5ce6bfc..e8491cb8f09b8 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -23,14 +23,7 @@
 from torch import nn
 from tqdm import tqdm
 
-from transformers import (
-    WhisperConfig,
-    WhisperFeatureExtractor,
-    WhisperForConditionalGeneration,
-    WhisperModel,
-    WhisperProcessor,
-    WhisperTokenizer,
-)
+from transformers import WhisperConfig, WhisperForConditionalGeneration
 
 
 def remove_ignore_keys_(state_dict):
@@ -184,10 +177,10 @@ def _download(url: str, root: str) -> bytes:
 
 
 def convert_every_model(save_dir="whisper"):
-    layers = [4, 6, 12, 24, 32]
-    width = [384, 512, 768, 1024, 1280]
-    heads = [6, 8, 12, 16, 20]
-    name = ["tiny", "base", "small", "medium", "large"]
+    layers = [4, 6, 12, 24, 32, 4, 6, 12, 24]
+    width = [384, 512, 768, 1024, 1280, 384, 512, 768, 1024]
+    heads = [6, 8, 12, 16, 20, 6, 8, 12, 16]
+    name = ["tiny", "base", "small", "medium", "large", "tiny.en", "base.en", "small.en", "medium.en"]
     for l, w, h, n in zip(layers, width, heads, name):
 
         config = WhisperConfig(
@@ -200,13 +193,12 @@ def convert_every_model(save_dir="whisper"):
             decoder_ffn_dim=4 * w,
             encoder_ffn_dim=4 * w,
         )
-        model = WhisperModel(config)
+        model = WhisperForConditionalGeneration(config)
 
         model_bytes, _ = _download(_MODELS[n], "original-weights")
         with io.BytesIO(model_bytes) as fp:
             original = torch.load(fp, map_location="cpu")["model_state_dict"]
 
-        # original = torch.load(f"/home/arthur_huggingface_co/whisper/tiny.pt")
         new = rename_keys(original.copy())
 
         missing, unexpected = model.load_state_dict(new, strict=False)
@@ -214,53 +206,7 @@ def convert_every_model(save_dir="whisper"):
             print("succesfully loaded")
             model.save_pretrained(f"{save_dir}/{n}")
 
-        checkpoint_path = f"openai/whisper-{n}"
-        model.push_to_hub(checkpoint_path, use_auth_token="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")
 
-        tokenizer = WhisperTokenizer.from_pretrained("ArthurZ/whisper-small.en")
-        # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
-
-        feature_extractor = WhisperFeatureExtractor()
-        processor = WhisperProcessor(feature_extractor, tokenizer)
-        processor.push_to_hub(checkpoint_path, use_auth_token="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")
-
-    # for en only, decoder input_ids = 50257
-
-    for l, w, h, n in zip(layers, width, heads, name):
-        n += ".en"
-        config = WhisperConfig(
-            vocab_size=51864,
-            encoder_layers=l,
-            encoder_attention_heads=h,
-            decoder_attention_heads=h,
-            decoder_layers=l,
-            d_model=w,
-            decoder_ffn_dim=4 * w,
-            encoder_ffn_dim=4 * w,
-        )
-        model = WhisperModel(config)
-
-        model_bytes, _ = _download(_MODELS[n], "original-weights")
-        with io.BytesIO(model_bytes) as fp:
-            original = torch.load(fp, map_location="cpu")["model_state_dict"]
-
-        # original = torch.load(f"/home/arthur_huggingface_co/whisper/tiny.pt")
-        new = rename_keys(original.copy())
-
-        missing, unexpected = model.load_state_dict(new, strict=False)
-        if missing == []:
-            print("succesfully loaded")
-            model.save_pretrained(f"{save_dir}/{n}")
-
-        checkpoint_path = f"openai/whisper-{n}"
-        model.push_to_hub(checkpoint_path, use_auth_token="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")
-
-        tokenizer = WhisperTokenizer.from_pretrained("/home/arthur_huggingface_co/transformers/whisper-any.en")
-        # tokenizer.push_to_hub(checkpoint_path, use_auth_token = "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")
-
-        feature_extractor = WhisperFeatureExtractor()
-        processor = WhisperProcessor(feature_extractor, tokenizer)
-        processor.push_to_hub(checkpoint_path, use_auth_token="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")
 
 
 if __name__ == "__main__":

From f62fd14926ecd4dea7dcb06d9b4cc934df4def73 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 11:58:59 +0000
Subject: [PATCH 068/156] fix logit tests

---
 src/transformers/models/whisper/modeling_whisper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 9f840979f62b3..69a387bd79c7d 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -901,7 +901,7 @@ def forward(
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
-        attention_mask = None
+        
         encoder_attention_mask = None
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
@@ -1086,6 +1086,8 @@ def forward(
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        attention_mask = None
+
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_features,

From d4efa53757da17d3daa33be6ce7479493f948dde Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 11:59:46 +0000
Subject: [PATCH 069/156] fixup

---
 .../models/whisper/convert_openai_whisper_to_tfms.py            | 2 --
 src/transformers/models/whisper/modeling_whisper.py             | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index e8491cb8f09b8..3c637c92e0053 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -207,8 +207,6 @@ def convert_every_model(save_dir="whisper"):
             model.save_pretrained(f"{save_dir}/{n}")
 
 
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     # # Required parameters
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 69a387bd79c7d..d22c6c715102e 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -901,7 +901,7 @@ def forward(
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
-        
+
         encoder_attention_mask = None
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:

From cf156ce02389ce08aa1dc1791bb2ddf57b134d41 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 12:06:33 +0000
Subject: [PATCH 070/156] clean merge

---
 .../models/whisper/modeling_whisper.py             | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 44324806c8956..7c038bf42555a 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -438,7 +438,6 @@ def forward(
         return outputs
 
 
-# Copied from transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextPreTrainedModel with Speech2Text->Whisper
 class WhisperPreTrainedModel(PreTrainedModel):
     config_class = WhisperConfig
     base_model_prefix = "model"
@@ -460,8 +459,13 @@ def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (WhisperDecoder, WhisperEncoder)):
             module.gradient_checkpointing = value
 
-    def _get_feat_extract_output_lengths(self, input: int):
-        return (input - 1) // 2 + 1
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+        input_lengths = (input_lengths - 1) // 2 + 1
+
+        return input_lengths
 
     def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
         # generate creates 3D attention mask, because of the shape of input_features
@@ -469,7 +473,7 @@ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_ma
         if len(attention_mask.shape) > 2:
             attention_mask = attention_mask[:, 0, :]
 
-        subsampled_lengths = ((attention_mask.sum(-1) - 1) // 2) + 1
+        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
         bsz = attention_mask.size()[0]
         attention_mask = torch.zeros(
             (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
@@ -1223,7 +1227,7 @@ def forward(
 
         >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
-        
+
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 
         >>> inputs = processor(

From 570941b477fb4755debb05b3074308847f2573bc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 16:02:39 +0000
Subject: [PATCH 071/156] revert toc_tree changes

---
 docs/source/en/_toctree.yml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 2f85a5a8beee5..eeb69761f5fb2 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -42,8 +42,7 @@
       title: Use tokenizers from 🤗 Tokenizers
     - local: multilingual
       title: Inference for multilingual models
-    - isExpanded: false
-      sections:
+    - sections:
       - local: tasks/sequence_classification
         title: Text classification
       - local: tasks/token_classification
@@ -59,6 +58,7 @@
       - local: tasks/multiple_choice
         title: Multiple choice
       title: Task guides
+      isExpanded: false
     title: Natural Language Processing
   - sections:
     - local: tasks/audio_classification
@@ -101,6 +101,8 @@
       title: Instantiating a big model
     - local: debugging
       title: Debugging
+    - local: hpo_train
+      title: Hyperparameter Search using Trainer API
     title: Performance and scalability
   - sections:
     - local: contributing
@@ -443,6 +445,8 @@
         title: XLS-R
       - local: model_doc/xlsr_wav2vec2
         title: XLSR-Wav2Vec2
+      - local: model_doc/whisper
+        title: Whisper
       title: Audio models
     - isExpanded: false
       sections:
@@ -480,8 +484,6 @@
         title: Vision Text Dual Encoder
       - local: model_doc/visual_bert
         title: VisualBERT
-      - local: model_doc/whisper
-        title: Whisper
       - local: model_doc/xclip
         title: X-CLIP
       title: Multimodal models
@@ -507,4 +509,4 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
+  title: API
\ No newline at end of file

From 261c1f204972710dd47f0a8daadec5ced066e5e3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 16:07:58 +0000
Subject: [PATCH 072/156] remove useless LogitProcessor

---
 src/transformers/generation_logits_process.py | 42 -------------------
 1 file changed, 42 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 2d12e1c5d07a1..97d8ba262b10a 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -717,45 +717,3 @@ def __call__(self, input_ids, scores):
         logits = scores
         logits[:, self.suppress_tokens] = -np.inf
         return logits
-
-
-class ApplyTimestampRules(LogitsProcessor):
-    r""" """
-
-    def __init__(self, tokenizer, sample_begin: int = 1, max_initial_timestamp_index: Optional[int] = None):
-        self.tokenizer = tokenizer
-        self.sample_begin = sample_begin
-        self.max_initial_timestamp_index = max_initial_timestamp_index
-
-    def __call__(self, input_ids, scores):
-        tokens = input_ids
-        logits = scores
-        # suppress <|notimestamps|> which is handled by without_timestamps
-        if self.tokenizer.no_timestamps is not None:
-            logits[:, self.tokenizer.no_timestamps] = -np.inf
-
-        # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
-        for k in range(tokens.shape[0]):
-            seq = [t for t in tokens[k, self.sample_begin :].tolist()]
-            last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.tokenizer.timestamp_begin
-            penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.tokenizer.timestamp_begin
-
-            if last_was_timestamp:
-                if penultimate_was_timestamp:  # has to be non-timestamp
-                    logits[k, self.tokenizer.timestamp_begin :] = -np.inf
-                else:  # cannot be normal text tokens
-                    logits[k, : self.tokenizer.eot] = -np.inf
-
-        # apply the `max_initial_timestamp` option
-        if tokens.shape[1] == self.sample_begin and self.max_initial_timestamp_index is not None:
-            last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
-            logits[:, last_allowed + 1 :] = -np.inf
-
-        # if sum of probability over timestamps is above any other token, sample timestamp
-        logprobs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
-        for k in range(tokens.shape[0]):
-            timestamp_logprob = logprobs[k, self.tokenizer.timestamp_begin :].logsumexp(dim=-1)
-            max_text_token_logprob = logprobs[k, : self.tokenizer.timestamp_begin].max()
-            if timestamp_logprob > max_text_token_logprob:
-                logits[k, : self.tokenizer.timestamp_begin] = -np.inf
-        return logits

From d65e755744050e22cda3cb475ef8a43701831f5d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 16:08:05 +0000
Subject: [PATCH 073/156] Update whisper .mdx

---
 docs/source/en/model_doc/whisper.mdx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
index 25ddb6d7c0e83..711738d83a354 100644
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -24,10 +24,11 @@ The abstract from the paper is the following:
 
 Tips:
 
-<INSERT TIPS ABOUT MODEL HERE>
+- The model usually performs well without requiring any finetuning. 
+- The architecture follows a classic Encoder/Decoder architecture, which means that it relies on the [`generate`](https://huggingface.co/docs/transformers/v4.22.2/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate) function 
 
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
-The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).
+This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
+The original code can be found [here](https://github.com/openai/whisper).
 
 
 ## WhisperConfig

From aa957770fa67206ba252ceba0b97894bab8d6c8a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 16:28:52 +0000
Subject: [PATCH 074/156] update config file doc

---
 .../models/whisper/configuration_whisper.py   | 84 ++++++++++---------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index ad90e3a1bff5d..b6eb72f670821 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -62,62 +62,65 @@ class WhisperConfig(PretrainedConfig):
 
 
     Args:
-        vocab_size (`int`, *optional*, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 51865):
             Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`WhisperModel`]
-        d_model (`int`, *optional*, defaults to 1024):
-            Dimensionality of the layers and the pooler layer.
-        encoder_layers (`int`, *optional*, defaults to 12):
+        num_mel_bins (`int`, *optional*, defaults to 80):
+            Number of mel features used per input features. Should correspond to the value used in the `WhisperProcessor``
+            class. 
+        encoder_layers (`int`, *optional*, defaults to 6):
             Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 6):
             Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 1536):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+        encoder_ffn_dim (`int`, *optional*, defaults to 1536):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_start_token_id (`int`, *optional*, defaults to 50258):
+            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids` 
+            are provided to the `generate`function
+        use_cache (`bool`, *optional*, defaults to True):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        is_encoder_decoder (`bool`, *optional*, defaults to True):
+            _description_
+        activation_function (`str`, *optional*, defaults to "gelu"):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimensionality of the layers and the pooler layer.
         dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        max_source_positions (`int`, *optional*, defaults to 6000):
+        scale_embedding (`bool`, *optional*, defaults to False):
+            _description_
+        max_source_positions (`int`, *optional*, defaults to 1500):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
-        max_target_positions (`int`, *optional*, defaults to 1024):
+        max_target_positions (`int`, *optional*, defaults to 448):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        num_conv_layers (`int`, *optional*, defaults to 2):
-            Number of 1D convolutional layers in the conv module.
-        conv_kernel_sizes (`Tuple[int]`, *optional*, defaults to `(5, 5)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The length
-            of `conv_kernel_sizes` has to match `num_conv_layers`.
-        conv_channels (`int`, *optional*, defaults to 1024):
-            An integer defining the number of output channels of each convolution layers except the final one in the
-            conv module.
-        input_feat_per_channel (`int`, *optional*, defaults to 80):
-            An integer specifying the size of feature vector. This is also the dimensions of log-mel filter-bank
-            features.
-        input_channels (`int`, *optional*, defaults to 1):
-            An integer specifying number of input channels of the input feature vector.
+        pad_token_id (`int`, *optional*, defaults to 50256):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            Begin of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 50257):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to True):
+            Whether to tie input and output embeddings.
 
     Example:
 
@@ -140,7 +143,6 @@ class WhisperConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size=51865,
-        feature_size=1,
         num_mel_bins=80,
         encoder_layers=6,
         encoder_attention_heads=4,
@@ -165,10 +167,14 @@ def __init__(
         pad_token_id=0,
         bos_token_id=50257,
         eos_token_id=50257,
-        input_channels=1,
         tie_word_embeddings=True,
         **kwargs
     ):
+        """_summary_
+
+        Args:
+
+        """
         self.vocab_size = vocab_size
         self.num_mel_bins = num_mel_bins
         self.d_model = d_model
@@ -189,10 +195,8 @@ def __init__(
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
         self.tie_word_embeddings = tie_word_embeddings
-        self.input_channels = input_channels
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
-        self.feature_size = feature_size
         self.non_speech_tokens = NON_SPEECH_TOKENS
         super().__init__(
             pad_token_id=pad_token_id,

From 0a23c18bd299fdf9ab703dfb0bca645109850bae Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 16:34:11 +0000
Subject: [PATCH 075/156] update configuration docstring

---
 src/transformers/models/whisper/configuration_whisper.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index b6eb72f670821..f5f4441f2b41d 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -55,7 +55,7 @@ class WhisperConfig(PretrainedConfig):
     This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate an
     Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the Whisper
-    [openai/whisper-base](https://huggingface.co/openai/whisper-base) architecture.
+    [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.
 
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -121,7 +121,9 @@ class WhisperConfig(PretrainedConfig):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to True):
             Whether to tie input and output embeddings.
-
+        non_speech_tokens (`List[int]`, *optional*, defaults to None):
+            A list containing the non-speech tokens that will be used by the logit processor in the `generate`
+            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here. 
     Example:
 
     ```python
@@ -168,6 +170,7 @@ def __init__(
         bos_token_id=50257,
         eos_token_id=50257,
         tie_word_embeddings=True,
+        non_speech_tokens=None,
         **kwargs
     ):
         """_summary_
@@ -197,7 +200,7 @@ def __init__(
         self.tie_word_embeddings = tie_word_embeddings
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
-        self.non_speech_tokens = NON_SPEECH_TOKENS
+        self.non_speech_tokens = non_speech_tokens
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

From 530095630b1b34ffaf9bc12b82fb285312fd09d5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:38:31 +0000
Subject: [PATCH 076/156] update test tokenization

---
 tests/models/whisper/test_tokenization_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 5999c6d225640..4dc49500c81a5 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -26,7 +26,7 @@
 
 @require_sentencepiece
 @require_tokenizers
-class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+class WhisperTokenizeirTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = WhisperTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = False

From afcf30d9c8380af1a1e3fab00a3471b0ee6aa13c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:38:47 +0000
Subject: [PATCH 077/156] update test tokenization

---
 tests/models/whisper/test_tokenization_whisper.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 4dc49500c81a5..65254ff577c4d 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -15,7 +15,7 @@
 import unittest
 
 from transformers.models.whisper import WhisperTokenizer
-from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+from transformers.testing_utils import slow
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -24,8 +24,7 @@
 ES_CODE = 50256
 
 
-@require_sentencepiece
-@require_tokenizers
+
 class WhisperTokenizeirTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = WhisperTokenizer
     test_rust_tokenizer = False

From 03f11d822bb1fddd4322fd308740d9d311a94e54 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:39:15 +0000
Subject: [PATCH 078/156] update tokenization whisper Added copied from where
 needed

---
 .../models/whisper/tokenization_whisper.py    | 99 +++----------------
 1 file changed, 14 insertions(+), 85 deletions(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index d8b2bfadb4948..12a1622f02bbc 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -15,7 +15,6 @@
 """Tokenization classes for Whisper."""
 import json
 import os
-from functools import lru_cache
 from typing import List, Optional, Tuple
 
 import regex as re
@@ -37,7 +36,7 @@
 }
 
 MAX_MODEL_INPUT_SIZES = {
-    "openai/whisper-base": 1024,
+    "openai/whisper-base": 448,
 }
 
 
@@ -160,8 +159,7 @@
     "castilian": "es",
 }
 
-
-@lru_cache()
+# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
 def bytes_to_unicode():
     """
     Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
@@ -188,7 +186,7 @@ def bytes_to_unicode():
 
 logger = logging.get_logger(__name__)
 
-
+# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
 def get_pairs(word):
     """
     Return set of symbol pairs in a word.
@@ -284,70 +282,13 @@ def __init__(
 
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
+    
     def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
 
     @property
-    @lru_cache()
-    def sot_sequence(self) -> Tuple[int]:
-        translate = self.all_special_ids[-6]
-        transcribe = self.all_special_ids[-5]
-        sot_sequence = [self.all_special_ids[1]]
-
-        if self.language is not None:
-            additional_tokens = dict(
-                zip(
-                    self.additional_special_tokens,
-                    self.additional_special_tokens_ids,
-                )
-            )
-            self.language_token = additional_tokens[f"<|{self.language}|>"]
-            langs = tuple(LANGUAGES.keys())
-            sot_sequence.append(self.all_special_ids[1] + 1 + langs.index(self.language))
-
-        if self.task is not None:
-            sot_sequence.append(transcribe if self.task == "transcribe" else translate)
-        return sot_sequence
-
-    def _get_single_token_id(self, text) -> int:
-        tokens = self.encode(text)
-        return tokens[0]
-
-    @property
-    @lru_cache()
-    def sot(self) -> int:
-        return self._get_single_token_id("<|startoftranscript|>")
-
-    @property
-    @lru_cache()
-    def sot_lm(self) -> int:
-        return self._get_single_token_id("<|startoflm|>")
-
-    @property
-    @lru_cache()
-    def sot_prev(self) -> int:
-        return self._get_single_token_id("<|startofprev|>")
-
-    @property
-    @lru_cache()
-    def no_captions(self) -> int:
-        return self._get_single_token_id("<|nocaptions|>")
-
-    @property
-    @lru_cache()
-    def no_timestamps(self) -> int:
-        return self._get_single_token_id("<|notimestamps|>")
-
-    @property
-    @lru_cache()
-    def timestamp_begin(self) -> int:
-        return self.tokenizer.all_special_ids[-1] + 1
-
-    @property
-    @lru_cache()
     def all_language_tokens(self) -> Tuple[int]:
         result = []
         for token, token_id in zip(
@@ -359,19 +300,15 @@ def all_language_tokens(self) -> Tuple[int]:
         return tuple(result)
 
     @property
-    @lru_cache()
     def all_language_codes(self) -> Tuple[str]:
         return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
 
-    @property
-    @lru_cache()
-    def sot_sequence_including_notimestamps(self) -> Tuple[int]:
-        return tuple(list(self.sot_sequence) + [self.no_timestamps])
-
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size with GPT2 -> Whisper
     @property
     def vocab_size(self) -> int:
         return len(self.encoder)
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper
     def bpe(self, token):
         if token in self.cache:
             return self.cache[token]
@@ -414,6 +351,7 @@ def bpe(self, token):
         self.cache[token] = word
         return word
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.build_inputs_with_special_tokens with GPT2 -> Whisper
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         if self.add_bos_token:
             bos_token_ids = [self.bos_token_id]
@@ -427,6 +365,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
 
         return output + bos_token_ids + token_ids_1
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_special_tokens_mask with GPT2 -> Whisper
     def get_special_tokens_mask(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
@@ -459,6 +398,7 @@ def get_special_tokens_mask(
             return [1] + ([0] * len(token_ids_0))
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize with GPT2 -> Whisper
     def _tokenize(self, text):
         """Tokenize a string."""
         bpe_tokens = []
@@ -469,6 +409,7 @@ def _tokenize(self, text):
             bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
         return bpe_tokens
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id with GPT2 -> Whisper
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
@@ -477,12 +418,14 @@ def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.decoder.get(self.unk_token_id))
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string with GPT2 -> Whisper
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         text = "".join(tokens)
         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary with GPT2 -> Whisper
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
@@ -512,28 +455,14 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
 
         return vocab_file, merge_file
 
-    def decode_with_timestamps(self, tokens) -> str:
-        """
-        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
-        given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
-        """
-        outputs = [[]]
-        for token in tokens:
-            if token >= self.timestamp_begin:
-                timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
-                outputs.append(timestamp)
-                outputs.append([])
-            else:
-                outputs[-1].append(token)
-        outputs = [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
-        return "".join(outputs)
-
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.prepare_for_tokenization with GPT2 -> Whisper
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
         if is_split_into_words or add_prefix_space:
             text = " " + text
         return (text, kwargs)
 
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._build_conversation_input_ids with GPT2 -> Whisper
     def _build_conversation_input_ids(self, conversation) -> List[int]:
         input_ids = []
         for is_user, text in conversation.iter_texts():

From 017010ff550f4da0f2d3f6ab1d1ad8390806d5b8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:39:31 +0000
Subject: [PATCH 079/156] update feature extraction

---
 .../models/whisper/feature_extraction_whisper.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 586be7a2c1955..8ae504d2fa49c 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -49,16 +49,14 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
             The feature dimension of the extracted features.
         sampling_rate (`int`, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
-        num_mel_bins (`int`, defaults to 80):
-            Number of Mel-frequency bins.
+        hop_length (`int`, defaults to 160):
+            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. 
+        chunk_length (`int`, defaults to 30):
+            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio sequences.
+        n_fft (`int`, defaults to 30):
+            Size of the Fourier transform.
         padding_value (`float`, defaults to 0.0):
-            The value that is used to fill the padding vectors.
-        do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
-            Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
-        normalize_means (`bool`, *optional*, defaults to `True`):
-            Whether or not to zero-mean normalize the extracted features.
-        normalize_vars (`bool`, *optional*, defaults to `True`):
-            Whether or not to unit-variance normalize the extracted features.
+            Padding value used to pad the audio. Should correspond to silences.
     """
 
     model_input_names = ["input_features", "attention_mask"]

From 9f0f3326a9287aa89d29040641cc2c52b972c3c6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:40:40 +0000
Subject: [PATCH 080/156] nit test name

---
 tests/models/whisper/test_tokenization_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 65254ff577c4d..dc5e24c443306 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -25,7 +25,7 @@
 
 
 
-class WhisperTokenizeirTest(TokenizerTesterMixin, unittest.TestCase):
+class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = WhisperTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = False

From fde6e991721adb3d1f11213bb15b3462c0e2dc53 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:42:28 +0000
Subject: [PATCH 081/156] style

---
 src/transformers/models/whisper/configuration_whisper.py    | 6 +++---
 .../models/whisper/feature_extraction_whisper.py            | 2 +-
 src/transformers/models/whisper/tokenization_whisper.py     | 2 +-
 tests/models/whisper/test_tokenization_whisper.py           | 2 --
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index f5f4441f2b41d..c710db036f334 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -67,7 +67,7 @@ class WhisperConfig(PretrainedConfig):
             `inputs_ids` passed when calling [`WhisperModel`]
         num_mel_bins (`int`, *optional*, defaults to 80):
             Number of mel features used per input features. Should correspond to the value used in the `WhisperProcessor``
-            class. 
+            class.
         encoder_layers (`int`, *optional*, defaults to 6):
             Number of encoder layers.
         decoder_layers (`int`, *optional*, defaults to 6):
@@ -87,7 +87,7 @@ class WhisperConfig(PretrainedConfig):
             The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
             for more details.
         decoder_start_token_id (`int`, *optional*, defaults to 50258):
-            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids` 
+            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
             are provided to the `generate`function
         use_cache (`bool`, *optional*, defaults to True):
             Whether or not the model should return the last key/values attentions (not used by all models).
@@ -123,7 +123,7 @@ class WhisperConfig(PretrainedConfig):
             Whether to tie input and output embeddings.
         non_speech_tokens (`List[int]`, *optional*, defaults to None):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
-            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here. 
+            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
     Example:
 
     ```python
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 8ae504d2fa49c..1bf9b8582cbbc 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -50,7 +50,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
         sampling_rate (`int`, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
         hop_length (`int`, defaults to 160):
-            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients. 
+            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
         chunk_length (`int`, defaults to 30):
             The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio sequences.
         n_fft (`int`, defaults to 30):
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 12a1622f02bbc..ce4d6a7cc2b1e 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -282,7 +282,7 @@ def __init__(
 
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-    
+
     def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index dc5e24c443306..97dbf914044f2 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -24,7 +24,6 @@
 ES_CODE = 50256
 
 
-
 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = WhisperTokenizer
     test_rust_tokenizer = False
@@ -99,7 +98,6 @@ def test_tokenizer_integration(self):
         )
 
 
-@require_sentencepiece
 class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
     checkpoint_name = "openai/whisper-small.en"
 

From 9cca7eb6e2e5c52a778fd14adbf87f8fb52a998e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:45:04 +0000
Subject: [PATCH 082/156] quality

---
 src/transformers/models/whisper/configuration_whisper.py  | 8 +++-----
 .../models/whisper/feature_extraction_whisper.py          | 3 ++-
 src/transformers/models/whisper/tokenization_whisper.py   | 3 ++-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index c710db036f334..96309e47015ba 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -66,8 +66,8 @@ class WhisperConfig(PretrainedConfig):
             Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
             `inputs_ids` passed when calling [`WhisperModel`]
         num_mel_bins (`int`, *optional*, defaults to 80):
-            Number of mel features used per input features. Should correspond to the value used in the `WhisperProcessor``
-            class.
+            Number of mel features used per input features. Should correspond to the value used in the
+            `WhisperProcessor`` class.
         encoder_layers (`int`, *optional*, defaults to 6):
             Number of encoder layers.
         decoder_layers (`int`, *optional*, defaults to 6):
@@ -175,9 +175,7 @@ def __init__(
     ):
         """_summary_
 
-        Args:
-
-        """
+        Args:"""
         self.vocab_size = vocab_size
         self.num_mel_bins = num_mel_bins
         self.d_model = d_model
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 1bf9b8582cbbc..4032a75471af1 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -52,7 +52,8 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
         hop_length (`int`, defaults to 160):
             Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
         chunk_length (`int`, defaults to 30):
-            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio sequences.
+            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
+            sequences.
         n_fft (`int`, defaults to 30):
             Size of the Fourier transform.
         padding_value (`float`, defaults to 0.0):
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index ce4d6a7cc2b1e..9d2ab73c4cc4a 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -159,6 +159,7 @@
     "castilian": "es",
 }
 
+
 # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
 def bytes_to_unicode():
     """
@@ -186,6 +187,7 @@ def bytes_to_unicode():
 
 logger = logging.get_logger(__name__)
 
+
 # Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
 def get_pairs(word):
     """
@@ -303,7 +305,6 @@ def all_language_tokens(self) -> Tuple[int]:
     def all_language_codes(self) -> Tuple[str]:
         return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
 
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size with GPT2 -> Whisper
     @property
     def vocab_size(self) -> int:
         return len(self.encoder)

From fa690087098f00bcf145fa575c9e2e0b84989fe4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 17:57:40 +0000
Subject: [PATCH 083/156] remove get suppress tokens and update non_speech
 tokens global variables

---
 .../models/whisper/configuration_whisper.py   | 21 ++++++++++---------
 .../models/whisper/tokenization_whisper.py    | 19 -----------------
 2 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 96309e47015ba..e8b6b79964fcd 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -34,18 +34,19 @@
     4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
     11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
     17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
-    34949, 40283, 40493, 40549, 47282, 49146
+    34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360
 ]
 NON_SPEECH_TOKENS_MULTI = [
-    1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
-    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
-    63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
-    893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
-    3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
-    7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
-    14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
-    22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
-    42863, 47425, 49870, 50254
+    1, 2, 6, 7, 8, 9,
+    10, 12, 14, 25, 26, 27, 28, 29, 31, 58,
+    59, 60, 61, 62, 63, 90, 91, 92, 93, 359,
+    503, 522, 542, 873, 893, 902, 918, 922, 931, 1350,
+    1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961,
+    4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938,
+    12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604,
+    18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464,
+    31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50257, 50360,
+    50359
 ]
 # fmt: on
 
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 9d2ab73c4cc4a..2142d477cd957 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -471,22 +471,3 @@ def _build_conversation_input_ids(self, conversation) -> List[int]:
         if len(input_ids) > self.model_max_length:
             input_ids = input_ids[-self.model_max_length :]
         return input_ids
-
-    # TODO move to the logit processor
-    def _get_suppress_tokens(self, suppress_tokens=[]) -> Tuple[int]:
-
-        if isinstance(suppress_tokens, str):
-            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
-
-        if -1 in suppress_tokens:
-            suppress_tokens = [t for t in suppress_tokens if t >= 0]
-            suppress_tokens.extend(NON_SPEECH_TOKENS)
-        elif suppress_tokens is None or len(suppress_tokens) == 0:
-            suppress_tokens = []  # interpret empty string as an empty list
-
-        suppress_tokens.extend([self.sot, self.sot_prev, self.sot_lm])
-        if self.no_captions is not None:
-            # no-captions probability is collected separately
-            suppress_tokens.append(self.no_captions)
-
-        return tuple(sorted(set(suppress_tokens)))

From 69e2dce5c4679d94ad63d09089f59b33fa94b928 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 28 Sep 2022 20:03:42 +0200
Subject: [PATCH 084/156] Update
 src/transformers/models/whisper/feature_extraction_whisper.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/transformers/models/whisper/feature_extraction_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 4032a75471af1..f8331f4629fc6 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -60,7 +60,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
             Padding value used to pad the audio. Should correspond to silences.
     """
 
-    model_input_names = ["input_features", "attention_mask"]
+    model_input_names = ["input_features"]
 
     def __init__(
         self,

From 4243f7d84b65f66906cadb75e4bb5bd05aa548ca Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 19:32:00 +0000
Subject: [PATCH 085/156] clean modeling whisper and test Removed the attention
 mask arguments that are deprecated

---
 .../models/whisper/modeling_whisper.py        | 60 ++-----------------
 tests/models/whisper/test_modeling_whisper.py | 48 ++++-----------
 2 files changed, 18 insertions(+), 90 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 7c038bf42555a..1bc72eafa818a 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -467,23 +467,7 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
 
         return input_lengths
 
-    def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
-        # generate creates 3D attention mask, because of the shape of input_features
-        # convert it to 2D if thats the case
-        if len(attention_mask.shape) > 2:
-            attention_mask = attention_mask[:, 0, :]
-
-        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
-        bsz = attention_mask.size()[0]
-        attention_mask = torch.zeros(
-            (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
-        )
 
-        # these two operations makes sure that all values
-        # before the output lengths indices are attended to
-        attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
-        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
-        return attention_mask
 
 
 WHISPER_START_DOCSTRING = r"""
@@ -627,7 +611,6 @@ def __init__(self, config: WhisperConfig):
     def forward(
         self,
         input_features,
-        attention_mask=None,
         head_mask=None,
         output_attentions=None,
         output_hidden_states=None,
@@ -679,15 +662,6 @@ def forward(
         hidden_states = inputs_embeds + embed_pos
         hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
-        attention_mask = None
-        # subsample attention mask if necessary
-        if attention_mask is not None:
-            attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
-            attention_mask = attention_mask.ne(1).long()
-            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
-        # else:
-        #     attention_mask = torch.ones([], dtype=torch.long, device=inputs_embeds.device)
-
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
@@ -716,13 +690,13 @@ def custom_forward(*inputs):
                     layer_outputs = torch.utils.checkpoint.checkpoint(
                         create_custom_forward(encoder_layer),
                         hidden_states,
-                        attention_mask,
+                        None,
                         (head_mask[idx] if head_mask is not None else None),
                     )
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,
-                        attention_mask,
+                        None,
                         layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
@@ -803,7 +777,6 @@ def forward(
         input_ids=None,
         attention_mask=None,
         encoder_hidden_states=None,
-        encoder_attention_mask=None,
         head_mask=None,
         cross_attn_head_mask=None,
         past_key_values=None,
@@ -906,12 +879,6 @@ def forward(
             attention_mask, input_shape, inputs_embeds, past_key_values_length
         )
 
-        encoder_attention_mask = None
-        # expand encoder attention mask
-        if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
-
         # embed positions
         positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
 
@@ -962,7 +929,6 @@ def custom_forward(*inputs):
                     hidden_states,
                     attention_mask,
                     encoder_hidden_states,
-                    encoder_attention_mask,
                     head_mask[idx] if head_mask is not None else None,
                     cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                     None,
@@ -973,7 +939,6 @@ def custom_forward(*inputs):
                     hidden_states,
                     attention_mask=attention_mask,
                     encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
                     layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                     cross_attn_layer_head_mask=(
                         cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
@@ -1046,7 +1011,6 @@ def get_decoder(self):
     def forward(
         self,
         input_features=None,
-        attention_mask=None,
         decoder_input_ids=None,
         decoder_attention_mask=None,
         head_mask=None,
@@ -1090,12 +1054,9 @@ def forward(
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        attention_mask = None
-
         if encoder_outputs is None:
             encoder_outputs = self.encoder(
                 input_features,
-                attention_mask=attention_mask,
                 head_mask=head_mask,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -1110,22 +1071,13 @@ def forward(
             )
 
         # downsample encoder attention mask
-        if attention_mask is not None:
-            encoder_attention_mask = self._get_feature_vector_attention_mask(
-                encoder_outputs[0].shape[1], attention_mask
-            )
-        else:
-            encoder_attention_mask = None
-
-        encoder_attention_mask = None
-        decoder_attention_mask = None
+        # decoder_attention_mask = None
 
         # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
-            encoder_attention_mask=encoder_attention_mask,
             head_mask=decoder_head_mask,
             cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
@@ -1195,7 +1147,6 @@ def set_output_embeddings(self, new_embeddings):
     def forward(
         self,
         input_features=None,
-        attention_mask=None,
         decoder_input_ids=None,
         decoder_attention_mask=None,
         head_mask=None,
@@ -1251,7 +1202,6 @@ def forward(
 
         outputs = self.model(
             input_features,
-            attention_mask=attention_mask,
             decoder_input_ids=decoder_input_ids,
             encoder_outputs=encoder_outputs,
             decoder_attention_mask=decoder_attention_mask,
@@ -1289,17 +1239,17 @@ def forward(
         )
 
     def prepare_inputs_for_generation(
-        self, decoder_input_ids, past=None, use_cache=None, encoder_outputs=None, **kwargs
+        self, decoder_input_ids,  past=None, use_cache=None, encoder_outputs=None, attention_mask = None, **kwargs
     ):
         # cut decoder_input_ids if past is used
         if past is not None:
             decoder_input_ids = decoder_input_ids[:, -1:]
 
+
         return {
             "encoder_outputs": encoder_outputs,
             "past_key_values": past,
             "decoder_input_ids": decoder_input_ids,
-            "attention_mask": None,
             "use_cache": use_cache,
             "decoder_attention_mask": None,
         }
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 2daed9cf4c2d0..7b272e4ce9d77 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -22,15 +22,7 @@
 
 from transformers import WhisperConfig
 from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
-from transformers.testing_utils import (
-    is_torch_available,
-    require_sentencepiece,
-    require_tokenizers,
-    require_torch,
-    require_torchaudio,
-    slow,
-    torch_device,
-)
+from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
 from transformers.utils import cached_property
 
 from ...generation.test_generation_utils import GenerationTesterMixin
@@ -62,8 +54,6 @@ def prepare_whisper_inputs_dict(
     decoder_head_mask=None,
     cross_attn_head_mask=None,
 ):
-    if attention_mask is None:
-        attention_mask = input_features.ne(0)
     if decoder_attention_mask is None:
         decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
     if head_mask is None:
@@ -76,8 +66,7 @@ def prepare_whisper_inputs_dict(
         # "input_ids": input_features,
         "input_features": input_features,
         "decoder_input_ids": decoder_input_ids,
-        "attention_mask": attention_mask,
-        "decoder_attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
         "head_mask": head_mask,
         "decoder_head_mask": decoder_head_mask,
         "cross_attn_head_mask": cross_attn_head_mask,
@@ -136,17 +125,15 @@ def __init__(
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
-        attention_mask = torch.ones(
-            [self.batch_size, self.max_source_positions], dtype=torch.long, device=torch_device
-        )
+
         decoder_input_ids = torch.tensor(self.batch_size * [[self.decoder_start_token_id]], device=torch_device)
 
         config = self.get_config()
         inputs_dict = prepare_whisper_inputs_dict(
             config,
+            attention_mask = None,
             input_features=input_features,
             decoder_input_ids=decoder_input_ids,
-            attention_mask=attention_mask,
         )
         return config, inputs_dict
 
@@ -243,7 +230,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
             encoder = WhisperEncoder.from_pretrained(tmpdirname).to(torch_device)
 
         encoder_last_hidden_state_2 = encoder(
-            inputs_dict["input_features"], attention_mask=inputs_dict["attention_mask"]
+            inputs_dict["input_features"]
         )[0]
 
         self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
@@ -253,15 +240,11 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
             decoder.save_pretrained(tmpdirname)
             decoder = WhisperDecoder.from_pretrained(tmpdirname).to(torch_device)
 
-        encoder_attention_mask = encoder._get_feature_vector_attention_mask(
-            encoder_last_hidden_state.shape[1], inputs_dict["attention_mask"]
-        )
 
         last_hidden_state_2 = decoder(
             input_ids=inputs_dict["decoder_input_ids"],
             attention_mask=inputs_dict["decoder_attention_mask"],
             encoder_hidden_states=encoder_last_hidden_state,
-            encoder_attention_mask=encoder_attention_mask,
         )[0]
 
         self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
@@ -323,9 +306,7 @@ def _get_input_ids_and_config(self):
             # hack to allow generate for models such as GPT2 as is done in `generate()`
             config.pad_token_id = config.eos_token_id
 
-        attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:max_batch_size, :sequence_length]
-
-        return config, input_ids, attention_mask, max_length
+        return config, input_ids, None, max_length
 
     # not implemented currently
     def test_inputs_embeds(self):
@@ -345,12 +326,11 @@ def test_generate_fp16(self):
         config, input_dict = self.model_tester.prepare_config_and_inputs()
         config.max_target_positions = 400
         input_features = input_dict["input_features"]
-        attention_mask = input_dict["attention_mask"]
         model = WhisperForConditionalGeneration(config).eval().to(torch_device)
         if torch_device == "cuda":
             input_features = input_features.half()
             model.half()
-        model.generate(input_features, attention_mask=attention_mask)
+        model.generate(input_features)
         model.generate(input_features, num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
 
     def test_forward_signature(self):
@@ -364,7 +344,6 @@ def test_forward_signature(self):
 
             expected_arg_names = [
                 "input_features",
-                "attention_mask",
                 "decoder_input_ids",
                 "decoder_attention_mask",
             ]
@@ -625,12 +604,11 @@ def test_generate_without_input_ids(self):
 
     @staticmethod
     def _get_encoder_outputs(
-        model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
+        model, input_ids,attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
     ):
         encoder = model.get_encoder()
         encoder_outputs = encoder(
             input_ids,
-            attention_mask=attention_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
         )
@@ -701,11 +679,10 @@ def _create_and_check_torchscript(self, config, inputs_dict):
             try:
                 model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
                 input_features = inputs["input_features"]
-                attention_mask = inputs["attention_mask"]
                 decoder_input_ids = inputs["decoder_input_ids"]
                 decoder_attention_mask = inputs["decoder_attention_mask"]
                 traced_model = torch.jit.trace(
-                    model, (input_features, attention_mask, decoder_input_ids, decoder_attention_mask)
+                    model, (input_features, decoder_input_ids, decoder_attention_mask)
                 )
             except RuntimeError:
                 self.fail("Couldn't trace module.")
@@ -745,8 +722,6 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 
 @require_torch
 @require_torchaudio
-@require_sentencepiece
-@require_tokenizers
 class WhisperModelIntegrationTests(unittest.TestCase):
     @cached_property
     def default_processor(self):
@@ -761,6 +736,7 @@ def _load_datasamples(self, num_samples):
 
         return [x["array"] for x in speech_samples]
 
+    @slow
     def test_tiny_logits_librispeech(self):
         torch_device = "cpu"
         set_seed(0)
@@ -806,6 +782,7 @@ def test_tiny_logits_librispeech(self):
         head_logits = logits[0] @ model.decoder.embed_tokens.weight.T
         self.assertTrue(torch.allclose(head_logits[0, 0, :30].cpu(), EXPECTED_GENERATION, atol=1e-4))
 
+    @slow
     def test_small_en_logits_librispeech(self):
         set_seed(0)
         torch_device = "cpu"
@@ -878,6 +855,7 @@ def test_large_logits_librispeech(self):
 
         self.assertTrue(torch.allclose(logits[0, 0, :30].cpu(), EXPECTED_LOGITS, atol=1e-4))
 
+    @slow
     def test_tiny_en_generation(self):
 
         torch_device = "cpu"
@@ -943,7 +921,7 @@ def test_large_generation(self):
         logits_processor = LogitsProcessorList(
             [
                 SuppressBlank(tokenizer.encode(" "), 50256),
-                SuppressTokens(tokenizer._get_suppress_tokens("-1")),
+                SuppressTokens(model.config.non_speech_tokens),
             ]
         )
         tokenizer.eos_token_id = 50257

From 044e371080ef10c34545a14f7be1cacdb3518419 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 20:19:36 +0000
Subject: [PATCH 086/156] fix large test

---
 tests/models/whisper/test_modeling_whisper.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 7b272e4ce9d77..1589f23483e4b 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -924,11 +924,6 @@ def test_large_generation(self):
                 SuppressTokens(model.config.non_speech_tokens),
             ]
         )
-        tokenizer.eos_token_id = 50257
-        tokenizer.eos_token = "<|endoftext|>"
-        model.config.eos_token_id = 50257
-        model.config.decoder_start_token_id = 50258
-
         decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(
             input_features,
@@ -938,5 +933,5 @@ def test_large_generation(self):
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we're glad"
+        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From 1268f4be7adbf947b57f0d98daee490f718eab1a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:16:11 +0000
Subject: [PATCH 087/156] Add multilingual audio test, and translate test

---
 .../whisper/feature_extraction_whisper.py     |  7 +-
 tests/models/whisper/test_modeling_whisper.py | 72 ++++++++++++++++++-
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index f8331f4629fc6..dea610bcd443c 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -158,7 +158,6 @@ def __call__(
         truncation: bool = True,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        sampling_rate: Optional[int] = None,
         return_attention_mask: Optional[bool] = None,
         padding: Optional[str] = "max_length",
         max_length: Optional[int] = None,
@@ -230,7 +229,7 @@ def __call__(
             max_length=max_length if max_length else self.n_samples,
             truncation=truncation,
             pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=return_attention_mask,
+            return_attention_mask=False,
             **kwargs,
         )
         # make sure list is in array format
@@ -243,10 +242,6 @@ def __call__(
         else:
             padded_inputs["input_features"] = input_features
 
-        attention_mask = np.asarray(padded_inputs.get("attention_mask"))[:, : self.nb_max_frame]
-        if attention_mask is not None:
-            padded_inputs["attention_mask"] = [attention_mask]
-
         if return_tensors is not None:
             padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
 
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 1589f23483e4b..d91559a9199d8 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -23,12 +23,17 @@
 from transformers import WhisperConfig
 from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
+
+from transformers.utils.import_utils import is_datasets_available
 from transformers.utils import cached_property
 
 from ...generation.test_generation_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
 
+if is_datasets_available():
+    import datasets
+    from datasets import load_dataset
 
 if is_torch_available():
     import torch
@@ -719,7 +724,6 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 
             self.assertTrue(models_equal)
 
-
 @require_torch
 @require_torchaudio
 class WhisperModelIntegrationTests(unittest.TestCase):
@@ -728,7 +732,7 @@ def default_processor(self):
         return WhisperProcessor.from_pretrained("openai/whisper-base")
 
     def _load_datasamples(self, num_samples):
-        from datasets import load_dataset
+        
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
@@ -935,3 +939,67 @@ def test_large_generation(self):
 
         EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+
+    @slow
+    def test_large_generation_multilingual(self):
+        torch_device = "cuda"
+        set_seed(0)
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
+        model.to(torch_device)
+
+        ds = load_dataset("common_voice", "ja", split="test", streaming=True)
+        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
+        ds_iter = iter(ds)
+        input_speech = next(ds_iter)["audio"]["array"]
+
+        feaure_extractor = WhisperFeatureExtractor()
+
+        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
+
+        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
+
+        logits_processor = LogitsProcessorList(
+            [
+                SuppressBlank(tokenizer.encode(" "), 50256),
+                SuppressTokens(model.config.non_speech_tokens),
+            ]
+        )
+        decoder_input_ids = torch.tensor([[50258,50359, 50357, 50363]]).long().to(torch_device)
+        generated_ids = model.generate(
+            input_features,
+            do_sample=True,
+            logits_processor=logits_processor,
+            decoder_input_ids=decoder_input_ids,
+        )
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        EXPECTED_TRANSCRIPT = "昨日は8時間寝ました"
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+        decoder_input_ids = torch.tensor([[50258,50359, 50357]]).long().to(torch_device)
+        generated_ids = model.generate(
+            input_features,
+            do_sample=False,
+            logits_processor=logits_processor,
+            decoder_input_ids=decoder_input_ids,
+        )
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kashite moraimashita."
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+
+        decoder_input_ids = torch.tensor([[50258,50357]]).long().to(torch_device)
+        generated_ids = model.generate(
+            input_features,
+            do_sample=False,
+            logits_processor=logits_processor,
+            decoder_input_ids=decoder_input_ids,
+        )
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        EXPECTED_TRANSCRIPT = "I borrowed a phone from Kimura san"
+        self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+

From 1578988790cab78fe71baa076d8a30a736edb104 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:17:04 +0000
Subject: [PATCH 088/156] style

---
 .../models/whisper/modeling_whisper.py        |  5 +---
 .../models/whisper/tokenization_whisper.py    |  1 -
 tests/models/whisper/test_modeling_whisper.py | 30 +++++++------------
 3 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 1bc72eafa818a..8aa4144e665d4 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -468,8 +468,6 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
         return input_lengths
 
 
-
-
 WHISPER_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
@@ -1239,13 +1237,12 @@ def forward(
         )
 
     def prepare_inputs_for_generation(
-        self, decoder_input_ids,  past=None, use_cache=None, encoder_outputs=None, attention_mask = None, **kwargs
+        self, decoder_input_ids, past=None, use_cache=None, encoder_outputs=None, attention_mask=None, **kwargs
     ):
         # cut decoder_input_ids if past is used
         if past is not None:
             decoder_input_ids = decoder_input_ids[:, -1:]
 
-
         return {
             "encoder_outputs": encoder_outputs,
             "past_key_values": past,
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 2142d477cd957..0d726f5ef61a4 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -21,7 +21,6 @@
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
-from .configuration_whisper import NON_SPEECH_TOKENS
 
 
 SPIECE_UNDERLINE = "▁"
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index d91559a9199d8..2794f20361b9d 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -23,14 +23,14 @@
 from transformers import WhisperConfig
 from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
-
-from transformers.utils.import_utils import is_datasets_available
 from transformers.utils import cached_property
+from transformers.utils.import_utils import is_datasets_available
 
 from ...generation.test_generation_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
 
+
 if is_datasets_available():
     import datasets
     from datasets import load_dataset
@@ -136,7 +136,7 @@ def prepare_config_and_inputs(self):
         config = self.get_config()
         inputs_dict = prepare_whisper_inputs_dict(
             config,
-            attention_mask = None,
+            attention_mask=None,
             input_features=input_features,
             decoder_input_ids=decoder_input_ids,
         )
@@ -234,9 +234,7 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
             encoder.save_pretrained(tmpdirname)
             encoder = WhisperEncoder.from_pretrained(tmpdirname).to(torch_device)
 
-        encoder_last_hidden_state_2 = encoder(
-            inputs_dict["input_features"]
-        )[0]
+        encoder_last_hidden_state_2 = encoder(inputs_dict["input_features"])[0]
 
         self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
 
@@ -245,7 +243,6 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
             decoder.save_pretrained(tmpdirname)
             decoder = WhisperDecoder.from_pretrained(tmpdirname).to(torch_device)
 
-
         last_hidden_state_2 = decoder(
             input_ids=inputs_dict["decoder_input_ids"],
             attention_mask=inputs_dict["decoder_attention_mask"],
@@ -302,7 +299,6 @@ def _get_input_ids_and_config(self):
 
         # cut to half length & take max batch_size 3
         max_batch_size = 3
-        sequence_length = input_ids.shape[-1] // 2
         input_ids = input_ids[:max_batch_size, :, :]
 
         # generate max 3 tokens
@@ -609,7 +605,7 @@ def test_generate_without_input_ids(self):
 
     @staticmethod
     def _get_encoder_outputs(
-        model, input_ids,attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
+        model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
     ):
         encoder = model.get_encoder()
         encoder_outputs = encoder(
@@ -686,9 +682,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
                 input_features = inputs["input_features"]
                 decoder_input_ids = inputs["decoder_input_ids"]
                 decoder_attention_mask = inputs["decoder_attention_mask"]
-                traced_model = torch.jit.trace(
-                    model, (input_features, decoder_input_ids, decoder_attention_mask)
-                )
+                traced_model = torch.jit.trace(model, (input_features, decoder_input_ids, decoder_attention_mask))
             except RuntimeError:
                 self.fail("Couldn't trace module.")
 
@@ -724,6 +718,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 
             self.assertTrue(models_equal)
 
+
 @require_torch
 @require_torchaudio
 class WhisperModelIntegrationTests(unittest.TestCase):
@@ -732,7 +727,6 @@ def default_processor(self):
         return WhisperProcessor.from_pretrained("openai/whisper-base")
 
     def _load_datasamples(self, num_samples):
-        
 
         ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         # automatic decoding with librispeech
@@ -940,7 +934,6 @@ def test_large_generation(self):
         EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-
     @slow
     def test_large_generation_multilingual(self):
         torch_device = "cuda"
@@ -965,7 +958,7 @@ def test_large_generation_multilingual(self):
                 SuppressTokens(model.config.non_speech_tokens),
             ]
         )
-        decoder_input_ids = torch.tensor([[50258,50359, 50357, 50363]]).long().to(torch_device)
+        decoder_input_ids = torch.tensor([[50258, 50359, 50357, 50363]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=True,
@@ -977,7 +970,7 @@ def test_large_generation_multilingual(self):
         EXPECTED_TRANSCRIPT = "昨日は8時間寝ました"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-        decoder_input_ids = torch.tensor([[50258,50359, 50357]]).long().to(torch_device)
+        decoder_input_ids = torch.tensor([[50258, 50359, 50357]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=False,
@@ -989,8 +982,7 @@ def test_large_generation_multilingual(self):
         EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kashite moraimashita."
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-
-        decoder_input_ids = torch.tensor([[50258,50357]]).long().to(torch_device)
+        decoder_input_ids = torch.tensor([[50258, 50357]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=False,
@@ -1001,5 +993,3 @@ def test_large_generation_multilingual(self):
 
         EXPECTED_TRANSCRIPT = "I borrowed a phone from Kimura san"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
-
-

From 8387ce844065279aac6ba5d0ae0dd2f2c37fe38d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:20:50 +0000
Subject: [PATCH 089/156] fix larg multilingual test

---
 tests/models/whisper/test_modeling_whisper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 2794f20361b9d..218513adcac5f 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -958,7 +958,7 @@ def test_large_generation_multilingual(self):
                 SuppressTokens(model.config.non_speech_tokens),
             ]
         )
-        decoder_input_ids = torch.tensor([[50258, 50359, 50357, 50363]]).long().to(torch_device)
+        decoder_input_ids = torch.tensor([[50258, 50359, 50266, 50363]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=True,
@@ -967,7 +967,7 @@ def test_large_generation_multilingual(self):
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = "昨日は8時間寝ました"
+        EXPECTED_TRANSCRIPT = "昨日は8時間寝ました。 "
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
         decoder_input_ids = torch.tensor([[50258, 50359, 50357]]).long().to(torch_device)

From 6b14b674b6db76d1c6c91c3db4e9be98b1058b11 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:28:07 +0000
Subject: [PATCH 090/156] nits

---
 tests/models/whisper/test_modeling_whisper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 218513adcac5f..58f8c044366fa 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -936,7 +936,7 @@ def test_large_generation(self):
 
     @slow
     def test_large_generation_multilingual(self):
-        torch_device = "cuda"
+        torch_device = "cpu"
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
@@ -967,7 +967,7 @@ def test_large_generation_multilingual(self):
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = "昨日は8時間寝ました。 "
+        EXPECTED_TRANSCRIPT = " 木村さんに電話を貸してもらいましょう。 I"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
         decoder_input_ids = torch.tensor([[50258, 50359, 50357]]).long().to(torch_device)

From cafe5f10ae9a09f74e6f2e3c4486a2d6883a60a3 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 28 Sep 2022 23:43:48 +0200
Subject: [PATCH 091/156] Update docs/source/en/model_doc/whisper.mdx

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 docs/source/en/model_doc/whisper.mdx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
index 711738d83a354..6d8287dd97db7 100644
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -15,7 +15,6 @@ specific language governing permissions and limitations under the License.
 ## Overview
 
 The Whisper model was proposed in [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
-<INSERT SHORT SUMMARY HERE>
 
 The abstract from the paper is the following:
 

From bbf35b11f4a5254b6b8476ec88e50ee65c5ba335 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:49:57 +0000
Subject: [PATCH 092/156] add copied from for attention layer

---
 src/transformers/models/whisper/modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 8aa4144e665d4..ade424c7ec433 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -103,7 +103,7 @@ def forward(self, input_ids, past_key_values_length=0):
 
         return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]]
 
-
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Whisper
 class WhisperAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 

From 40284ef526563a4488f4182a4d62429f2297562d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 21:51:26 +0000
Subject: [PATCH 093/156] remove attention masks in doc

---
 .../models/whisper/modeling_whisper.py        | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ade424c7ec433..bd6220b82f7d7 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -492,14 +492,6 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
             via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
             [`WhisperFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
             tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
-            1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
             Indices of decoder input sequence tokens in the vocabulary.
 
@@ -623,14 +615,6 @@ def forward(
                 `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the fbank features,
                 padding and conversion into a tensor of type `torch.FloatTensor`. See
                 [`~WhisperFeatureExtractor.__call__`]
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
-                `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
             head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                 Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
 
@@ -804,14 +788,6 @@ def forward(
             encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                 Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                 of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-
-                [What are attention masks?](../glossary#attention-mask)
             head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                 Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
 

From ebb79e9149a64bfcdaec1f58cad3dfc91958a1f6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 22:13:24 +0000
Subject: [PATCH 094/156] add english normalizer

---
 .../models/whisper/english_normalizer.py      | 69 ++++++++++++++++++-
 .../models/whisper/tokenization_whisper.py    | 45 +++++++++++-
 2 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index d3abd03975c36..ce493c78df425 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -20,7 +20,74 @@
 
 from more_itertools import windowed
 
-from .basic import remove_symbols_and_diacritics
+import unicodedata
+
+# non-ASCII letters that are not separated by "NFKD" normalization
+ADDITIONAL_DIACRITICS = {
+    "œ": "oe",
+    "Œ": "OE",
+    "ø": "o",
+    "Ø": "O",
+    "æ": "ae",
+    "Æ": "AE",
+    "ß": "ss",
+    "ẞ": "SS",
+    "đ": "d",
+    "Đ": "D",
+    "ð": "d",
+    "Ð": "D",
+    "þ": "th",
+    "Þ": "th",
+    "ł": "l",
+    "Ł": "L",
+}
+
+
+def remove_symbols_and_diacritics(s: str, keep=""):
+    """
+    Replace any other markers, symbols, and punctuations with a space,
+    and drop any diacritics (category 'Mn' and some manual mappings)
+    """
+    return "".join(
+        c
+        if c in keep
+        else ADDITIONAL_DIACRITICS[c]
+        if c in ADDITIONAL_DIACRITICS
+        else ""
+        if unicodedata.category(c) == "Mn"
+        else " "
+        if unicodedata.category(c)[0] in "MSP"
+        else c
+        for c in unicodedata.normalize("NFKD", s)
+    )
+
+
+def remove_symbols(s: str):
+    """
+    Replace any other markers, symbols, punctuations with a space, keeping diacritics
+    """
+    return "".join(
+        " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)
+    )
+
+
+class BasicTextNormalizer:
+    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
+        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
+        self.split_letters = split_letters
+
+    def __call__(self, s: str):
+        s = s.lower()
+        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
+        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
+        s = self.clean(s).lower()
+
+        if self.split_letters:
+            s = " ".join(regex.findall(r"\X", s, regex.U))
+
+        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
+
+        return s
 
 
 class EnglishNumberNormalizer:
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 0d726f5ef61a4..70631912bdb8b 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -15,13 +15,14 @@
 """Tokenization classes for Whisper."""
 import json
 import os
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import regex as re
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
+from .english_normalizer import EnglishTextNormalizer
 
 SPIECE_UNDERLINE = "▁"
 
@@ -418,6 +419,48 @@ def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index, self.decoder.get(self.unk_token_id))
 
+    def _normalize(self, text):
+        normalizer = EnglishTextNormalizer()
+        return normalizer(text)
+
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        normalize: bool = False,
+        **kwargs
+    ) -> str:
+        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        # To avoid mixing byte-level and unicode for byte-level BPT
+        # we need to build string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        current_sub_text = []
+        for token in filtered_tokens:
+            if skip_special_tokens and token in self.all_special_ids:
+                continue
+            if token in self.added_tokens_encoder:
+                if current_sub_text:
+                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+                    current_sub_text = []
+                sub_texts.append(token)
+            else:
+                current_sub_text.append(token)
+        if current_sub_text:
+            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+
+        text = "".join(sub_texts)
+
+        if normalize:
+            clean_text = self._normalize(text)
+            return clean_text
+        else:
+            return text
+
+
     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string with GPT2 -> Whisper
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""

From 1b6a09c4cd7adfeb453dff20ebcc5034d94c3dc0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 28 Sep 2022 22:16:27 +0000
Subject: [PATCH 095/156] update tokenization test

---
 tests/models/whisper/test_tokenization_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 97dbf914044f2..3c1deb2b7d99e 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -143,7 +143,7 @@ def test_tokenizer_equivalence(self):
 
     def test_tokenizer_special(self):
         multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
-        text = "<|startoftranscript|> Hey! How are you feeling? J'ai l'impression que 郷さん est prêt<|endoftext|>"
+        text = "<|startoftranscript|>Hey! How are you feeling? J'ai l'impression que 郷さん est prêt<|endoftext|>"
 
         multilingual_tokens = multilingual_tokenizer.encode(text)
 

From 5ca9dcb7395717a5cb044e8f4aeff6dcf2f8887b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 09:01:38 +0000
Subject: [PATCH 096/156] remove copied from in whisper attention : no bias in
 k_proj only

---
 src/transformers/models/whisper/modeling_whisper.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index bd6220b82f7d7..f69e567907115 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -103,7 +103,6 @@ def forward(self, input_ids, past_key_values_length=0):
 
         return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]]
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Whisper
 class WhisperAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 

From d0cf6609c187f6ebdc45d66876687eb9df915db0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 13:04:47 +0000
Subject: [PATCH 097/156] wrap around dependencies in english normalizer

---
 .../models/whisper/english_normalizer.py            | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index ce493c78df425..3f052ba75eeda 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -18,10 +18,17 @@
 from fractions import Fraction
 from typing import Iterator, List, Match, Optional, Union
 
-from more_itertools import windowed
+from transformers.utils.import_utils import is_more_itertools_available
+
+
+if is_more_itertools_available():
+    from more_itertools import windowed
 
 import unicodedata
 
+import regex
+
+
 # non-ASCII letters that are not separated by "NFKD" normalization
 ADDITIONAL_DIACRITICS = {
     "œ": "oe",
@@ -45,8 +52,8 @@
 
 def remove_symbols_and_diacritics(s: str, keep=""):
     """
-    Replace any other markers, symbols, and punctuations with a space,
-    and drop any diacritics (category 'Mn' and some manual mappings)
+    Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
+    manual mappings)
     """
     return "".join(
         c

From 378b84b9cfdc790e2a02eb4bb00eb99450b74aa0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 13:05:03 +0000
Subject: [PATCH 098/156] style

---
 src/transformers/models/whisper/tokenization_whisper.py | 2 +-
 src/transformers/utils/import_utils.py                  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 70631912bdb8b..902e89cabdc54 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -21,9 +21,9 @@
 
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
-
 from .english_normalizer import EnglishTextNormalizer
 
+
 SPIECE_UNDERLINE = "▁"
 
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json", "merges_file": "merges.txt"}
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index f2cf5ffd9bff4..afc182be732a2 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -451,6 +451,10 @@ def is_detectron2_available():
     return _detectron2_available
 
 
+def is_more_itertools_available():
+    return importlib.util.find_spec("more_itertools") is not None
+
+
 def is_rjieba_available():
     return importlib.util.find_spec("rjieba") is not None
 

From a71ead9d59b942381fb0f10b51367f3f58db0f1d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 13:40:22 +0000
Subject: [PATCH 099/156] correct import generation logits

---
 tests/models/whisper/test_modeling_whisper.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 58f8c044366fa..5ff12e2b0a6f6 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -21,7 +21,6 @@
 import unittest
 
 from transformers import WhisperConfig
-from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
 from transformers.utils import cached_property
 from transformers.utils.import_utils import is_datasets_available
@@ -37,7 +36,7 @@
 
 if is_torch_available():
     import torch
-
+    from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
     from transformers import (
         WhisperFeatureExtractor,
         WhisperForConditionalGeneration,
@@ -979,7 +978,7 @@ def test_large_generation_multilingual(self):
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kashite moraimashita."
+        EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita."
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
         decoder_input_ids = torch.tensor([[50258, 50357]]).long().to(torch_device)

From bdc12590dd97ae0bfdbf942b06c94f60ff05d2a4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 14:17:49 +0000
Subject: [PATCH 100/156] for now, wrap feature extractor with torch

---
 src/transformers/models/whisper/feature_extraction_whisper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index dea610bcd443c..6ea7c6849f009 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from transformers import is_torch_available
+from transformers.testing_utils import require_torch
 
 
 if is_torch_available():
@@ -33,7 +34,7 @@
 
 logger = logging.get_logger(__name__)
 
-
+@require_torch
 class WhisperFeatureExtractor(SequenceFeatureExtractor):
     r"""
     Constructs a Whisper feature extractor.

From bd99c2389c7b97e41729e70fd6d396979f4e519f Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Thu, 29 Sep 2022 16:59:08 +0200
Subject: [PATCH 101/156] Update
 src/transformers/models/whisper/convert_openai_whisper_to_tfms.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 .../models/whisper/convert_openai_whisper_to_tfms.py             | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 3c637c92e0053..c7a4e143ad9d1 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Convert Whisper checkpoints from the original repository. URL: https://github.com/openai/whisper"""
 
 import argparse
 import hashlib

From e204a5123c8cac8f32f708a05bd3f0a19a289924 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Thu, 29 Sep 2022 16:59:17 +0200
Subject: [PATCH 102/156] Update
 src/transformers/models/whisper/configuration_whisper.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/whisper/configuration_whisper.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index e8b6b79964fcd..b5637b904d1cb 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -125,6 +125,7 @@ class WhisperConfig(PretrainedConfig):
         non_speech_tokens (`List[int]`, *optional*, defaults to None):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
+
     Example:
 
     ```python

From 7fa70dbf8ab492545042417e03eee822ea2637ab Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Thu, 29 Sep 2022 16:59:25 +0200
Subject: [PATCH 103/156] Update docs/source/en/model_doc/whisper.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 docs/source/en/model_doc/whisper.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
index 6d8287dd97db7..8186ed22d5186 100644
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -24,7 +24,8 @@ The abstract from the paper is the following:
 Tips:
 
 - The model usually performs well without requiring any finetuning. 
-- The architecture follows a classic Encoder/Decoder architecture, which means that it relies on the [`generate`](https://huggingface.co/docs/transformers/v4.22.2/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate) function 
+- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`generate`](https://huggingface.co/docs/transformers/v4.22.2/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate) function for inference.
+- One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text.
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
 The original code can be found [here](https://github.com/openai/whisper).

From 5d62a99d2eba848c35c8f2bdd4241696e596cc18 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:04:38 +0000
Subject: [PATCH 104/156] remove torch depencies for feature extraction and
 style

---
 .../whisper/feature_extraction_whisper.py     | 90 ++++++++++++++-----
 tests/models/whisper/test_modeling_whisper.py |  3 +-
 2 files changed, 72 insertions(+), 21 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 6ea7c6849f009..3ef057ad0ff42 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -19,14 +19,10 @@
 from typing import List, Optional, Union
 
 import numpy as np
+from numpy.fft import fft
 
-from transformers import is_torch_available
-from transformers.testing_utils import require_torch
 
 
-if is_torch_available():
-    import torch
-
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
@@ -34,7 +30,7 @@
 
 logger = logging.get_logger(__name__)
 
-@require_torch
+
 class WhisperFeatureExtractor(SequenceFeatureExtractor):
     r"""
     Constructs a Whisper feature extractor.
@@ -132,27 +128,81 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
 
         return weights
 
-    def _extract_fbank_features(
-        self,
-        waveform: np.ndarray,
-    ) -> np.ndarray:
+    def fram_wave(self, waveform, center = True):
+        frames = []
+        for i in range(0,waveform.shape[0], self.hop_length) : 
+            half_window = (self.n_fft - 1 )//2 +1
+            if center == True : 
+                start = i-half_window if i > half_window else 0
+                end = i+half_window  if i < waveform.shape[0] - half_window else waveform.shape[0]
+
+                frame = waveform[start:end]
+
+                if start == 0 : 
+                    padd_width = (-i+half_window, 0)
+                    frame = np.pad(frame, pad_width=padd_width, mode = "reflect") 
+
+                elif end == waveform.shape[0] : 
+                    padd_width = (0, (i - waveform.shape[0] + half_window))
+                    frame = np.pad(frame, pad_width=padd_width, mode = "reflect") 
+                
+            else : 
+                frame = waveform[i : i + self.n_fft ]
+                frame_width = frame.shape[0]
+                if frame_width < waveform.shape[0] :
+                    frame = np.lib.pad(frame, pad_width=(0,self.n_fft - frame_width), mode = "constant", constant_values=0) 
+
+            frames.append(frame)
+        return np.stack(frames,0)
+
+    def stft(self, frames, window):
+        """
+        Calculates the complex Short-Time Fourier Transform (STFT) of the given
+        framed signal. Should give the same results as torch.stft
+        """
+        frame_size = frames.shape[1]
+        fft_size = self.n_fft
+
+        if fft_size is None:
+            fft_size = frame_size
+
+        if fft_size < frame_size:
+            raise ValueError('FFT size must greater or equal the frame size')
+        # number of FFT bins to store
+        num_fft_bins = (fft_size >> 1) + 1
+
+        data = np.empty((len(frames), num_fft_bins), dtype = np.complex64)
+        fft_signal = np.zeros(fft_size)
+
+        for f, frame in enumerate(frames):
+            if window is not None:
+                np.multiply(frame, window, out=fft_signal[:frame_size])
+            else:
+                fft_signal[:frame_size] = frame
+            data[f] = fft(fft_signal, axis=0)[:num_fft_bins]
+        return data.T
+
+    def _np_extract_fbank_features(self,waveform : np.array) -> np.ndarray : 
         """
-        Compute the log-Mel spectrogram of the provided audio
+        Compute the log-Mel spectrogram of the provided audio, gives similar 
+        results to a torch implementation at 1e-5 tolerance. 
         """
-        waveform = torch.from_numpy(waveform)
-        window = torch.hann_window(self.n_fft).to(waveform.device)
-        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
-        magnitudes = stft[:, :-1].abs() ** 2
+        window = np.hanning(self.n_fft + 1)[:-1]
+
+        frames = self.fram_wave(waveform)
+        stft = self.stft(frames, window=window)
+        magnitudes = (np.abs(stft[:, :-1]) ** 2)
 
-        filters = torch.from_numpy(self.mel_filters)
+        filters = self.mel_filters
         mel_spec = filters @ magnitudes
 
-        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
-        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = np.log10(np.clip(mel_spec,a_min=1e-10, a_max = None))
+        log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
 
         return log_spec
 
+
     def __call__(
         self,
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
@@ -236,9 +286,9 @@ def __call__(
         # make sure list is in array format
         input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
 
-        input_features = [self._extract_fbank_features(waveform) for waveform in input_features[0]]
+        input_features = [self._np_extract_fbank_features(waveform) for waveform in input_features[0]]
 
-        if isinstance(input_features[0], torch.Tensor) or isinstance(input_features[0], List):
+        if isinstance(input_features[0], List):
             padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
         else:
             padded_inputs["input_features"] = input_features
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 5ff12e2b0a6f6..a4febf5e71250 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -36,7 +36,7 @@
 
 if is_torch_available():
     import torch
-    from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
+
     from transformers import (
         WhisperFeatureExtractor,
         WhisperForConditionalGeneration,
@@ -45,6 +45,7 @@
         WhisperTokenizer,
         set_seed,
     )
+    from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
     from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
 
 

From f44801571001da49875f57e1f8b68a1a60a332f3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:05:07 +0000
Subject: [PATCH 105/156] fixup

---
 .../models/whisper/english_normalizer.py      |  4 +-
 .../whisper/feature_extraction_whisper.py     | 53 +++++++++----------
 .../models/whisper/modeling_whisper.py        |  1 +
 .../models/whisper/tokenization_whisper.py    |  7 +--
 4 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index 3f052ba75eeda..d9570537cea0e 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -73,9 +73,7 @@ def remove_symbols(s: str):
     """
     Replace any other markers, symbols, punctuations with a space, keeping diacritics
     """
-    return "".join(
-        " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)
-    )
+    return "".join(" " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s))
 
 
 class BasicTextNormalizer:
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 3ef057ad0ff42..3689b4b15f9fb 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -21,8 +21,6 @@
 import numpy as np
 from numpy.fft import fft
 
-
-
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
 from ...utils import TensorType, logging
@@ -128,32 +126,34 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
 
         return weights
 
-    def fram_wave(self, waveform, center = True):
+    def fram_wave(self, waveform, center=True):
         frames = []
-        for i in range(0,waveform.shape[0], self.hop_length) : 
-            half_window = (self.n_fft - 1 )//2 +1
-            if center == True : 
-                start = i-half_window if i > half_window else 0
-                end = i+half_window  if i < waveform.shape[0] - half_window else waveform.shape[0]
+        for i in range(0, waveform.shape[0], self.hop_length):
+            half_window = (self.n_fft - 1) // 2 + 1
+            if center == True:
+                start = i - half_window if i > half_window else 0
+                end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
 
                 frame = waveform[start:end]
 
-                if start == 0 : 
-                    padd_width = (-i+half_window, 0)
-                    frame = np.pad(frame, pad_width=padd_width, mode = "reflect") 
+                if start == 0:
+                    padd_width = (-i + half_window, 0)
+                    frame = np.pad(frame, pad_width=padd_width, mode="reflect")
 
-                elif end == waveform.shape[0] : 
+                elif end == waveform.shape[0]:
                     padd_width = (0, (i - waveform.shape[0] + half_window))
-                    frame = np.pad(frame, pad_width=padd_width, mode = "reflect") 
-                
-            else : 
-                frame = waveform[i : i + self.n_fft ]
+                    frame = np.pad(frame, pad_width=padd_width, mode="reflect")
+
+            else:
+                frame = waveform[i : i + self.n_fft]
                 frame_width = frame.shape[0]
-                if frame_width < waveform.shape[0] :
-                    frame = np.lib.pad(frame, pad_width=(0,self.n_fft - frame_width), mode = "constant", constant_values=0) 
+                if frame_width < waveform.shape[0]:
+                    frame = np.lib.pad(
+                        frame, pad_width=(0, self.n_fft - frame_width), mode="constant", constant_values=0
+                    )
 
             frames.append(frame)
-        return np.stack(frames,0)
+        return np.stack(frames, 0)
 
     def stft(self, frames, window):
         """
@@ -167,11 +167,11 @@ def stft(self, frames, window):
             fft_size = frame_size
 
         if fft_size < frame_size:
-            raise ValueError('FFT size must greater or equal the frame size')
+            raise ValueError("FFT size must greater or equal the frame size")
         # number of FFT bins to store
         num_fft_bins = (fft_size >> 1) + 1
 
-        data = np.empty((len(frames), num_fft_bins), dtype = np.complex64)
+        data = np.empty((len(frames), num_fft_bins), dtype=np.complex64)
         fft_signal = np.zeros(fft_size)
 
         for f, frame in enumerate(frames):
@@ -182,27 +182,26 @@ def stft(self, frames, window):
             data[f] = fft(fft_signal, axis=0)[:num_fft_bins]
         return data.T
 
-    def _np_extract_fbank_features(self,waveform : np.array) -> np.ndarray : 
+    def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
         """
-        Compute the log-Mel spectrogram of the provided audio, gives similar 
-        results to a torch implementation at 1e-5 tolerance. 
+        Compute the log-Mel spectrogram of the provided audio, gives similar
+        results to a torch implementation at 1e-5 tolerance.
         """
         window = np.hanning(self.n_fft + 1)[:-1]
 
         frames = self.fram_wave(waveform)
         stft = self.stft(frames, window=window)
-        magnitudes = (np.abs(stft[:, :-1]) ** 2)
+        magnitudes = np.abs(stft[:, :-1]) ** 2
 
         filters = self.mel_filters
         mel_spec = filters @ magnitudes
 
-        log_spec = np.log10(np.clip(mel_spec,a_min=1e-10, a_max = None))
+        log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
         log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
 
         return log_spec
 
-
     def __call__(
         self,
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index f69e567907115..768e1d18f3349 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -103,6 +103,7 @@ def forward(self, input_ids, past_key_values_length=0):
 
         return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]]
 
+
 class WhisperAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 902e89cabdc54..46ed902b23293 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -424,11 +424,7 @@ def _normalize(self, text):
         return normalizer(text)
 
     def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        normalize: bool = False,
-        **kwargs
+        self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, normalize: bool = False, **kwargs
     ) -> str:
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
 
@@ -460,7 +456,6 @@ def _decode(
         else:
             return text
 
-
     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string with GPT2 -> Whisper
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""

From 62b2572bfda368ebeb189ad66cb9686b92cdf2dc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:05:52 +0000
Subject: [PATCH 106/156] nit

---
 .../models/whisper/feature_extraction_whisper.py       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 3689b4b15f9fb..440da66f57377 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -130,7 +130,7 @@ def fram_wave(self, waveform, center=True):
         frames = []
         for i in range(0, waveform.shape[0], self.hop_length):
             half_window = (self.n_fft - 1) // 2 + 1
-            if center == True:
+            if center:
                 start = i - half_window if i > half_window else 0
                 end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
 
@@ -157,8 +157,8 @@ def fram_wave(self, waveform, center=True):
 
     def stft(self, frames, window):
         """
-        Calculates the complex Short-Time Fourier Transform (STFT) of the given
-        framed signal. Should give the same results as torch.stft
+        Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same
+        results as torch.stft
         """
         frame_size = frames.shape[1]
         fft_size = self.n_fft
@@ -184,8 +184,8 @@ def stft(self, frames, window):
 
     def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
         """
-        Compute the log-Mel spectrogram of the provided audio, gives similar
-        results to a torch implementation at 1e-5 tolerance.
+        Compute the log-Mel spectrogram of the provided audio, gives similar results to a torch implementation at 1e-5
+        tolerance.
         """
         window = np.hanning(self.n_fft + 1)[:-1]
 

From 351c9428fee7639ea4fae3b102029dd9a0b0ef39 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:20:05 +0000
Subject: [PATCH 107/156] update logitds

---
 tests/models/whisper/test_modeling_whisper.py | 47 ++++++++++---------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index a4febf5e71250..556198ef03b07 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -16,6 +16,7 @@
 
 import copy
 import inspect
+from locale import normalize
 import os
 import tempfile
 import unittest
@@ -757,10 +758,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9892, -6.7607, 5.7348, 3.6095, 0.2152, -5.7321, 4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
-                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3714,
-                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
+                2.9547, -6.7057, 5.6948, 3.6060, 0.2028, -5.7131, 4.8454, -1.8480,
+                0.2464, -1.3995, 10.3491, 3.3373, 0.0177, -7.9847, 3.5646, 8.4769,
+                4.0122, -2.3344, 11.2626, 1.0067, 0.9832, -8.6476, -3.3424, -9.3303,
+                1.1144, 3.4940, 7.2391, -5.2304, -1.5814, 10.5482
             ]
         )
         # fmt: on
@@ -769,10 +770,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3204, 1.9836,
-                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
-                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
-                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
+                -1.4729, -2.7544, 2.7368, 2.3457, 4.0224, -0.0156, -3.3636, 1.9609,
+                0.0326, 0.6874, 1.0637, 0.2784, -3.7079, -0.5307, 0.2900, 4.7735,
+                1.1159, 1.2945, 0.5803, -0.3822, 1.6661, 1.2853, 0.9415, 2.1819,
+                1.8381, -5.7385, -0.7763, 3.9704, 2.6306, 2.8336
             ]
         )
         # fmt: on
@@ -805,11 +806,11 @@ def test_small_en_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -3.6784, -7.7212, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
-                -8.0104, -4.6239, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
-                -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
-                -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
-                -11.1147, -8.1918
+                -5.7431, -9.3231, -10.7072, -13.4309, -9.2928, -11.4487, -7.4086,
+                -9.8974, -6.1540, -6.8334, -10.7648, -5.6510, -7.5492, -5.1742,
+                -8.3994, -9.3752, -8.7796, -9.2042, -9.4592, -10.1895, -11.8376,
+                -12.6912, -12.5018, -13.2994, -11.4461, -8.8825, -7.9569, -13.6898,
+                -12.6563, -9.7243
             ]
         )
         # fmt: on
@@ -843,10 +844,10 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8577, -0.6521, 2.5472,
-                1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
-                1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
-                1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
+                1.8844, 0.8033, 4.4131, 3.5382, 2.3053, 3.8265, -0.7464, 2.4677,
+                1.7290, 1.8508, 2.2446, 1.1164, 0.3844, 1.9060, 1.4199, 2.4646,
+                1.1612, 0.6382, 1.0921, 1.8465, 2.3622, 1.6158, 2.3126, 1.2661,
+                1.9403, 1.7156, 3.7835, 5.3524, 4.3426, 3.8247
             ]
         )
         # fmt: on
@@ -869,7 +870,7 @@ def test_tiny_en_generation(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
         generated_ids = model.generate(input_features, num_beams=5, forced_bos_token_id=50362)
-        transcript = tokenizer.batch_decode(generated_ids)[0]
+        transcript = tokenizer.batch_decode(generated_ids, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = (
             "<|startoftranscript|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
@@ -894,7 +895,7 @@ def test_tiny_generation(self):
 
         decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=decoder_input_ids)
-        transcript = tokenizer.decode(generated_ids[0])
+        transcript = tokenizer.decode(generated_ids[0], normalize = False)
 
         EXPECTED_TRANSCRIPT = (
             "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
@@ -929,7 +930,7 @@ def test_large_generation(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -965,7 +966,7 @@ def test_large_generation_multilingual(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = " 木村さんに電話を貸してもらいましょう。 I"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -977,7 +978,7 @@ def test_large_generation_multilingual(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita."
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -989,7 +990,7 @@ def test_large_generation_multilingual(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = "I borrowed a phone from Kimura san"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From 6adeabe0ebe2ecce97d9ab6df84fb2c3a91dd0bc Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:20:44 +0000
Subject: [PATCH 108/156] style

---
 tests/models/whisper/test_modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 556198ef03b07..9b0bbcf8932f0 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -16,10 +16,10 @@
 
 import copy
 import inspect
-from locale import normalize
 import os
 import tempfile
 import unittest
+from locale import normalize
 
 from transformers import WhisperConfig
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device

From 4b07c61617578a058dee74d729246c48a124f02a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 15:21:42 +0000
Subject: [PATCH 109/156] nit

---
 tests/models/whisper/test_modeling_whisper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 9b0bbcf8932f0..07c7c8228b77a 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -873,7 +873,7 @@ def test_tiny_en_generation(self):
         transcript = tokenizer.batch_decode(generated_ids, normalize = False)[0]
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            "<|startoftranscript|><|notimestamps|>  Mr. Quilter is the apostle of the middle"
             " classes, and we are glad to"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -898,7 +898,7 @@ def test_tiny_generation(self):
         transcript = tokenizer.decode(generated_ids[0], normalize = False)
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>  Mr. Quilter is the apostle of the middle"
             " classes and we are glad"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From a276b0780ad30e4ce2684f464406cdc5034274d4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 16:51:28 +0000
Subject: [PATCH 110/156] nits and fix final tests

---
 .../models/whisper/configuration_whisper.py   |  4 +-
 .../whisper/feature_extraction_whisper.py     |  4 +-
 .../test_feature_extraction_whisper.py        |  4 +-
 tests/models/whisper/test_modeling_whisper.py | 53 +++++++++----------
 4 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index b5637b904d1cb..2ea3eeda4d1af 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -156,7 +156,7 @@ def __init__(
         encoder_ffn_dim=1536,
         encoder_layerdrop=0.0,
         decoder_layerdrop=0.0,
-        decoder_start_token_id=50258,
+        decoder_start_token_id=50257,
         use_cache=True,
         is_encoder_decoder=True,
         activation_function="gelu",
@@ -170,7 +170,7 @@ def __init__(
         max_target_positions=448,
         pad_token_id=0,
         bos_token_id=50257,
-        eos_token_id=50257,
+        eos_token_id=50256,
         tie_word_embeddings=True,
         non_speech_tokens=None,
         **kwargs
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 440da66f57377..eb70c7ee4a5bd 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -73,7 +73,7 @@ def __init__(
         self.chunk_length = chunk_length
         self.return_attention_mask = True
         self.n_samples = chunk_length * sampling_rate
-        self.nb_max_frame = self.n_samples // hop_length
+        self.nb_max_frames = self.n_samples // hop_length
         self.sampling_rate = sampling_rate
         self.mel_filters = self.get_mel_filters(sampling_rate, n_fft, n_mels=feature_size)
 
@@ -128,7 +128,7 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
 
     def fram_wave(self, waveform, center=True):
         frames = []
-        for i in range(0, waveform.shape[0], self.hop_length):
+        for i in range(0, waveform.shape[0]+1, self.hop_length):
             half_window = (self.n_fft - 1) // 2 + 1
             if center:
                 start = i - half_window if i > half_window else 0
diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py
index 67ba729cae8c5..c67cab7820016 100644
--- a/tests/models/whisper/test_feature_extraction_whisper.py
+++ b/tests/models/whisper/test_feature_extraction_whisper.py
@@ -63,7 +63,7 @@ def __init__(
         max_seq_length=2000,
         feature_size=10,
         hop_length=160,
-        chunk_length=5,
+        chunk_length=8,
         padding_value=0.0,
         sampling_rate=4_000,
         return_attention_mask=True,
@@ -159,7 +159,7 @@ def test_call(self):
         # Test feature size
         input_features = feature_extractor(np_speech_inputs, padding="max_length", return_tensors="np").input_features
         self.assertTrue(input_features.ndim == 3)
-        self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frame)
+        self.assertTrue(input_features.shape[-1] == feature_extractor.nb_max_frames)
         self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
 
         # Test not batched input
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 07c7c8228b77a..3c011736be28a 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -19,7 +19,6 @@
 import os
 import tempfile
 import unittest
-from locale import normalize
 
 from transformers import WhisperConfig
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
@@ -758,10 +757,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9547, -6.7057, 5.6948, 3.6060, 0.2028, -5.7131, 4.8454, -1.8480,
-                0.2464, -1.3995, 10.3491, 3.3373, 0.0177, -7.9847, 3.5646, 8.4769,
-                4.0122, -2.3344, 11.2626, 1.0067, 0.9832, -8.6476, -3.3424, -9.3303,
-                1.1144, 3.4940, 7.2391, -5.2304, -1.5814, 10.5482
+                2.9892, -6.7607,  5.7348,  3.6096,  0.2152, -5.7321,  4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
+                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3713,
+                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
             ]
         )
         # fmt: on
@@ -770,10 +769,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4729, -2.7544, 2.7368, 2.3457, 4.0224, -0.0156, -3.3636, 1.9609,
-                0.0326, 0.6874, 1.0637, 0.2784, -3.7079, -0.5307, 0.2900, 4.7735,
-                1.1159, 1.2945, 0.5803, -0.3822, 1.6661, 1.2853, 0.9415, 2.1819,
-                1.8381, -5.7385, -0.7763, 3.9704, 2.6306, 2.8336
+                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3203,  1.9836,
+                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
+                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
+                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
             ]
         )
         # fmt: on
@@ -806,11 +805,11 @@ def test_small_en_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -5.7431, -9.3231, -10.7072, -13.4309, -9.2928, -11.4487, -7.4086,
-                -9.8974, -6.1540, -6.8334, -10.7648, -5.6510, -7.5492, -5.1742,
-                -8.3994, -9.3752, -8.7796, -9.2042, -9.4592, -10.1895, -11.8376,
-                -12.6912, -12.5018, -13.2994, -11.4461, -8.8825, -7.9569, -13.6898,
-                -12.6563, -9.7243
+                -3.6784,  -7.7211,  -9.5070, -11.9286,  -7.6489,  -9.7026,  -5.6188,
+                -8.0104,  -4.6238,  -5.1833,  -9.0485,  -3.4079,  -5.4874,  -2.6935,
+                -6.3479,  -7.3398,  -6.9558,  -7.6867,  -7.4748,  -8.3463,  -9.9781,
+                -10.8389, -10.3105, -11.7201,  -9.7261,  -7.1590,  -5.9272, -12.4509,
+                -11.1146,  -8.1918
             ]
         )
         # fmt: on
@@ -844,10 +843,10 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                1.8844, 0.8033, 4.4131, 3.5382, 2.3053, 3.8265, -0.7464, 2.4677,
-                1.7290, 1.8508, 2.2446, 1.1164, 0.3844, 1.9060, 1.4199, 2.4646,
-                1.1612, 0.6382, 1.0921, 1.8465, 2.3622, 1.6158, 2.3126, 1.2661,
-                1.9403, 1.7156, 3.7835, 5.3524, 4.3426, 3.8247
+                2.1382,  0.9381,  4.4671,  3.5589,  2.4022,  3.8576, -0.6521,  2.5472,
+                1.8301,  1.9957,  2.3432,  1.4678,  0.5459,  2.2597,  1.5179,  2.5357,
+                1.1624,  0.6194,  1.0757,  1.8259,  2.4076,  1.6601,  2.3503,  1.3376,
+                1.9891,  1.8635,  3.8931,  5.3699,  4.4772,  3.9184
             ]
         )
         # fmt: on
@@ -870,10 +869,10 @@ def test_tiny_en_generation(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
         generated_ids = model.generate(input_features, num_beams=5, forced_bos_token_id=50362)
-        transcript = tokenizer.batch_decode(generated_ids, normalize = False)[0]
+        transcript = tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle"
             " classes, and we are glad to"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -895,10 +894,10 @@ def test_tiny_generation(self):
 
         decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=decoder_input_ids)
-        transcript = tokenizer.decode(generated_ids[0], normalize = False)
+        transcript = tokenizer.decode(generated_ids[0])
 
         EXPECTED_TRANSCRIPT = (
-            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>  Mr. Quilter is the apostle of the middle"
+            "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
             " classes and we are glad"
         )
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -930,7 +929,7 @@ def test_large_generation(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -966,7 +965,7 @@ def test_large_generation_multilingual(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = " 木村さんに電話を貸してもらいましょう。 I"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -978,19 +977,19 @@ def test_large_generation_multilingual(self):
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita."
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-        decoder_input_ids = torch.tensor([[50258, 50357]]).long().to(torch_device)
+        decoder_input_ids = torch.tensor([[50258, 50266, 50358, 50363]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=False,
             logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, normalize = False)[0]
+        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = "I borrowed a phone from Kimura san"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From 07dd529b5f664d67f39281bb40ae917113015a68 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 16:51:44 +0000
Subject: [PATCH 111/156] add `is_more_itertools_available` to utils

---
 src/transformers/utils/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 9572a673f6718..794c3cd7114bd 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -87,6 +87,7 @@
     _LazyModule,
     ccl_version,
     is_accelerate_available,
+    is_more_itertools_available,
     is_apex_available,
     is_bitsandbytes_available,
     is_coloredlogs_available,

From bbafa5836341934e9f7b263ff54cd68efcefc1a8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Thu, 29 Sep 2022 17:01:48 +0000
Subject: [PATCH 112/156] quality

---
 .../whisper/feature_extraction_whisper.py     |  2 +-
 src/transformers/utils/__init__.py            |  2 +-
 tests/models/whisper/test_modeling_whisper.py | 34 +++++++++----------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index eb70c7ee4a5bd..96bbf5fa3663c 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -128,7 +128,7 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
 
     def fram_wave(self, waveform, center=True):
         frames = []
-        for i in range(0, waveform.shape[0]+1, self.hop_length):
+        for i in range(0, waveform.shape[0] + 1, self.hop_length):
             half_window = (self.n_fft - 1) // 2 + 1
             if center:
                 start = i - half_window if i > half_window else 0
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 794c3cd7114bd..08ddd5f7fc5db 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -87,7 +87,6 @@
     _LazyModule,
     ccl_version,
     is_accelerate_available,
-    is_more_itertools_available,
     is_apex_available,
     is_bitsandbytes_available,
     is_coloredlogs_available,
@@ -99,6 +98,7 @@
     is_in_notebook,
     is_ipex_available,
     is_librosa_available,
+    is_more_itertools_available,
     is_ninja_available,
     is_onnx_available,
     is_pandas_available,
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 3c011736be28a..f1d1c53452361 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -757,10 +757,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.9892, -6.7607,  5.7348,  3.6096,  0.2152, -5.7321,  4.8855, -1.6407,
-                0.2823, -1.5718, 10.4269,  3.4427,  0.0219, -8.0612,  3.4784,  8.4246,
-                4.0575, -2.2864, 11.1084,  0.9963,  0.9884, -8.5154, -3.5469, -9.3713,
-                0.9786,  3.5435,  7.4850, -5.2579, -1.4366, 10.4841
+                2.9892, -6.7607, 5.7348, 3.6096, 0.2152, -5.7321, 4.8855, -1.6407,
+                0.2823, -1.5718, 10.4269, 3.4427, 0.0219, -8.0612, 3.4784, 8.4246,
+                4.0575, -2.2864, 11.1084, 0.9963, 0.9884, -8.5154, -3.5469, -9.3713,
+                0.9786, 3.5435, 7.4850, -5.2579, -1.4366, 10.4841
             ]
         )
         # fmt: on
@@ -769,10 +769,10 @@ def test_tiny_logits_librispeech(self):
         # fmt: off
         EXPECTED_GENERATION = torch.tensor(
             [
-                -1.4651, -2.6944,  2.7821,  2.3793,  4.0738,  0.0188, -3.3203,  1.9836,
-                0.0520,  0.7095,  1.1063,  0.2952, -3.6786, -0.5249,  0.3105,  4.7691,
-                1.1562,  1.3046,  0.5810, -0.3624,  1.7006,  1.3424,  0.9817,  2.1958,
-                1.8775, -5.7046, -0.7679,  4.0113,  2.6848,  2.8609
+                -1.4651, -2.6944, 2.7821, 2.3793, 4.0738, 0.0188, -3.3203, 1.9836,
+                0.0520, 0.7095, 1.1063, 0.2952, -3.6786, -0.5249, 0.3105, 4.7691,
+                1.1562, 1.3046, 0.5810, -0.3624, 1.7006, 1.3424, 0.9817, 2.1958,
+                1.8775, -5.7046, -0.7679, 4.0113, 2.6848, 2.8609
             ]
         )
         # fmt: on
@@ -805,11 +805,11 @@ def test_small_en_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                -3.6784,  -7.7211,  -9.5070, -11.9286,  -7.6489,  -9.7026,  -5.6188,
-                -8.0104,  -4.6238,  -5.1833,  -9.0485,  -3.4079,  -5.4874,  -2.6935,
-                -6.3479,  -7.3398,  -6.9558,  -7.6867,  -7.4748,  -8.3463,  -9.9781,
-                -10.8389, -10.3105, -11.7201,  -9.7261,  -7.1590,  -5.9272, -12.4509,
-                -11.1146,  -8.1918
+                -3.6784, -7.7211, -9.5070, -11.9286, -7.6489, -9.7026, -5.6188,
+                -8.0104, -4.6238, -5.1833, -9.0485, -3.4079, -5.4874, -2.6935,
+                -6.3479, -7.3398, -6.9558, -7.6867, -7.4748, -8.3463, -9.9781,
+                -10.8389, -10.3105, -11.7201, -9.7261, -7.1590, -5.9272, -12.4509,
+                -11.1146, -8.1918
             ]
         )
         # fmt: on
@@ -843,10 +843,10 @@ def test_large_logits_librispeech(self):
         # fmt: off
         EXPECTED_LOGITS = torch.tensor(
             [
-                2.1382,  0.9381,  4.4671,  3.5589,  2.4022,  3.8576, -0.6521,  2.5472,
-                1.8301,  1.9957,  2.3432,  1.4678,  0.5459,  2.2597,  1.5179,  2.5357,
-                1.1624,  0.6194,  1.0757,  1.8259,  2.4076,  1.6601,  2.3503,  1.3376,
-                1.9891,  1.8635,  3.8931,  5.3699,  4.4772,  3.9184
+                2.1382, 0.9381, 4.4671, 3.5589, 2.4022, 3.8576, -0.6521, 2.5472,
+                1.8301, 1.9957, 2.3432, 1.4678, 0.5459, 2.2597, 1.5179, 2.5357,
+                1.1624, 0.6194, 1.0757, 1.8259, 2.4076, 1.6601, 2.3503, 1.3376,
+                1.9891, 1.8635, 3.8931, 5.3699, 4.4772, 3.9184
             ]
         )
         # fmt: on

From 07164fa164eb465d6b16b5f762898be868dd7d37 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 07:58:59 +0000
Subject: [PATCH 113/156] add begin supress tokens, supress tokens to generate
 args and config

---
 src/transformers/configuration_utils.py         |  2 ++
 src/transformers/generation_utils.py            | 17 +++++++++++++++++
 .../models/whisper/configuration_whisper.py     | 11 ++++++++---
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 3fdc0f265f633..74f2f7e81e856 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -299,6 +299,8 @@ def __init__(self, **kwargs):
         self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
         self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
         self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
+        self.supress_tokens = kwargs.pop("supress_tokens", None)
+        self.begin_supress_tokens = kwargs.pop("begin_supress_tokens", None)
 
         # Fine-tuning task arguments
         self.architectures = kwargs.pop("architectures", None)
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 71db5532ea38d..85027e0bee266 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -39,6 +39,7 @@
     NoRepeatNGramLogitsProcessor,
     PrefixConstrainedLogitsProcessor,
     RepetitionPenaltyLogitsProcessor,
+    SuppressTokensLogitsProcessor,
     TemperatureLogitsWarper,
     TopKLogitsWarper,
     TopPLogitsWarper,
@@ -691,6 +692,8 @@ def _get_logits_processor(
         exponential_decay_length_penalty: Tuple,
         logits_processor: Optional[LogitsProcessorList],
         renormalize_logits: Optional[bool],
+        supress_tokens: Optional[List[int]],
+        begin_supress_tokens: Optional[List[int]],
     ) -> LogitsProcessorList:
         """
         This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
@@ -725,6 +728,10 @@ def _get_logits_processor(
             if exponential_decay_length_penalty is not None
             else self.config.exponential_decay_length_penalty
         )
+        supress_tokens = supress_tokens if supress_tokens is not None else self.config.supress_tokens
+        begin_supress_tokens = (
+            begin_supress_tokens if begin_supress_tokens is not None else self.config.begin_supress_tokens
+        )
         # instantiate processors list
 
         # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
@@ -762,6 +769,9 @@ def _get_logits_processor(
             processors.append(
                 ExponentialDecayLengthPenalty(exponential_decay_length_penalty, eos_token_id, input_ids_seq_length)
             )
+        if supress_tokens is not None:
+            processors.append(SuppressTokensLogitsProcessor(supress_tokens, begin_supress_tokens))
+
         processors = self._merge_criteria_processor_list(processors, logits_processor)
         # `LogitNormalization` should always be the last logit processor, when present
         if renormalize_logits is True:
@@ -932,6 +942,8 @@ def generate(
         remove_invalid_values: Optional[bool] = None,
         synced_gpus: Optional[bool] = False,
         exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
+        supress_tokens: Optional[List[int]] = None,
+        begin_supress_tokens: Optional[List[int]] = None,
         **model_kwargs,
     ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
         r"""
@@ -1090,6 +1102,9 @@ def generate(
                 This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
                 generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates
                 where penalty starts and `decay_factor` represents the factor of exponential decay
+            supress_tokens  (`List[int]`, *optional*, defaults to `model.config.supress_tokens`):
+                A list of tokens that will be supressed at generation. Teh `SupressTokens` logit processor will set
+                their log probs to `-inf` so that they are not sampled.
 
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
@@ -1337,6 +1352,8 @@ def generate(
             exponential_decay_length_penalty=exponential_decay_length_penalty,
             logits_processor=logits_processor,
             renormalize_logits=renormalize_logits,
+            supress_tokens=supress_tokens,
+            begin_supress_tokens=begin_supress_tokens,
         )
 
         # 8. prepare stopping criteria
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 2ea3eeda4d1af..b472b86a5dc88 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -122,9 +122,12 @@ class WhisperConfig(PretrainedConfig):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to True):
             Whether to tie input and output embeddings.
-        non_speech_tokens (`List[int]`, *optional*, defaults to None):
+        supress_tokens (`List[int]`, *optional*, defaults to NON_SPEECH_TOKENS):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
+        begin_supress_tokens (`List[int]`, *optional*, defaults to [220,50256]]):
+            A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
+            the token for " " (`blank_token_id`) and the `eos_token_id`
 
     Example:
 
@@ -172,7 +175,8 @@ def __init__(
         bos_token_id=50257,
         eos_token_id=50256,
         tie_word_embeddings=True,
-        non_speech_tokens=None,
+        supress_tokens=NON_SPEECH_TOKENS,
+        begin_supress_tokens=[220, 50256],
         **kwargs
     ):
         """_summary_
@@ -200,7 +204,6 @@ def __init__(
         self.tie_word_embeddings = tie_word_embeddings
         self.max_source_positions = max_source_positions
         self.max_target_positions = max_target_positions
-        self.non_speech_tokens = non_speech_tokens
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -208,5 +211,7 @@ def __init__(
             is_encoder_decoder=is_encoder_decoder,
             decoder_start_token_id=decoder_start_token_id,
             tie_word_embeddings=tie_word_embeddings,
+            supress_tokens=supress_tokens,
+            begin_supress_tokens=begin_supress_tokens,
             **kwargs,
         )

From fd0c7e9e49c29b88916dd428522caeae83075e7f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 07:59:29 +0000
Subject: [PATCH 114/156] clean supressTokensLogitProcessor in generation
 logits

---
 src/transformers/generation_logits_process.py | 29 +++++--------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 6f18947df2f8d..fdfd9155b69bd 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -704,29 +704,16 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
         return scores
 
 
-class SuppressBlank(LogitsProcessor):
+class SuppressTokensLogitsProcessor(LogitsProcessor):
     r""" """
 
-    def __init__(self, blank_token_id, eos_token_id, sample_begin: int = 1):
-        self.blank_token_id = blank_token_id
-        self.eos_token_id = eos_token_id
-        self.sample_begin = sample_begin
-
-    def __call__(self, input_ids, scores):
-        tokens = input_ids
-        logits = scores
-        if tokens.shape[1] == self.sample_begin:
-            logits[:, self.blank_token_id + [self.eos_token_id]] = -np.inf
-        return logits
-
-
-class SuppressTokens(LogitsProcessor):
-    r""" """
-
-    def __init__(self, suppress_tokens):
+    def __init__(self, suppress_tokens, begining_tokens: Optional[List[int]] = None):
         self.suppress_tokens = list(suppress_tokens)
+        self.begining_tokens - list(begining_tokens)
 
     def __call__(self, input_ids, scores):
-        logits = scores
-        logits[:, self.suppress_tokens] = -np.inf
-        return logits
+        scores[:, self.suppress_tokens] = -np.inf
+        if input_ids.shape[1] == 1 and self.begining_tokens is not None:
+            scores[:, self.begining_tokens] = -np.inf
+
+        return scores

From 1f4fe24d081fea8486741613f61c89e63df4dbdd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 08:00:01 +0000
Subject: [PATCH 115/156] Nit naming

---
 src/transformers/generation_logits_process.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index fdfd9155b69bd..0eb4b5737bb9b 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -702,18 +702,16 @@ class LogitNormalization(LogitsProcessor, LogitsWarper):
     def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
         scores = scores.log_softmax(dim=-1)
         return scores
-
-
 class SuppressTokensLogitsProcessor(LogitsProcessor):
     r""" """
 
-    def __init__(self, suppress_tokens, begining_tokens: Optional[List[int]] = None):
+    def __init__(self, suppress_tokens, begin_supress_tokens: Optional[List[int]] = None):
         self.suppress_tokens = list(suppress_tokens)
-        self.begining_tokens - list(begining_tokens)
+        self.begin_supress_tokens - list(begin_supress_tokens)
 
     def __call__(self, input_ids, scores):
         scores[:, self.suppress_tokens] = -np.inf
-        if input_ids.shape[1] == 1 and self.begining_tokens is not None:
-            scores[:, self.begining_tokens] = -np.inf
+        if input_ids.shape[1] == 1 and self.begin_supress_tokens is not None :
+            scores[:, self.begin_supress_tokens] = -np.inf
 
         return scores

From 848f1c3fc8a4c4124d8c0064328af29f6d62d708 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 08:48:47 +0000
Subject: [PATCH 116/156] add supressTokensAtBegin

---
 src/transformers/generation_logits_process.py | 20 ++++++++++++++-----
 src/transformers/generation_utils.py          |  7 ++++++-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 0eb4b5737bb9b..f43bf39f78362 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -702,16 +702,26 @@ class LogitNormalization(LogitsProcessor, LogitsWarper):
     def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
         scores = scores.log_softmax(dim=-1)
         return scores
+
+
+class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
+    def __init__(self, begin_supress_tokens, begin_index):
+        self.begin_supress_tokens = list(begin_supress_tokens)
+        self.begin_index = begin_index
+
+    def __call__(self, input_ids, scores):
+        if input_ids.shape[1] == self.begin_index:
+            scores[:, self.begin_supress_tokens] = -np.inf
+
+        return scores
+
+
 class SuppressTokensLogitsProcessor(LogitsProcessor):
     r""" """
 
-    def __init__(self, suppress_tokens, begin_supress_tokens: Optional[List[int]] = None):
+    def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
-        self.begin_supress_tokens - list(begin_supress_tokens)
 
     def __call__(self, input_ids, scores):
         scores[:, self.suppress_tokens] = -np.inf
-        if input_ids.shape[1] == 1 and self.begin_supress_tokens is not None :
-            scores[:, self.begin_supress_tokens] = -np.inf
-
         return scores
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 85027e0bee266..61542659c7682 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -39,6 +39,7 @@
     NoRepeatNGramLogitsProcessor,
     PrefixConstrainedLogitsProcessor,
     RepetitionPenaltyLogitsProcessor,
+    SuppressTokensAtBeginLogitsProcessor,
     SuppressTokensLogitsProcessor,
     TemperatureLogitsWarper,
     TopKLogitsWarper,
@@ -770,7 +771,11 @@ def _get_logits_processor(
                 ExponentialDecayLengthPenalty(exponential_decay_length_penalty, eos_token_id, input_ids_seq_length)
             )
         if supress_tokens is not None:
-            processors.append(SuppressTokensLogitsProcessor(supress_tokens, begin_supress_tokens))
+            processors.append(SuppressTokensLogitsProcessor(supress_tokens))
+        if begin_supress_tokens is not None:
+            begin_index = input_ids_seq_length
+            begin_index = begin_index if (input_ids_seq_length > 1 or forced_bos_token_id is None) else begin_index + 1
+            processors.append(SuppressTokensAtBeginLogitsProcessor(begin_supress_tokens, begin_index))
 
         processors = self._merge_criteria_processor_list(processors, logits_processor)
         # `LogitNormalization` should always be the last logit processor, when present

From 24980861dc9eb538883e48630be3ad0797489d12 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 08:49:07 +0000
Subject: [PATCH 117/156] udpate tests, supress tokens to None or correct
 values

---
 tests/models/whisper/test_modeling_whisper.py | 36 ++++++++-----------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f1d1c53452361..6940f051495a9 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -21,6 +21,7 @@
 import unittest
 
 from transformers import WhisperConfig
+from transformers.models.whisper.configuration_whisper import NON_SPEECH_TOKENS_MULTI
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
 from transformers.utils import cached_property
 from transformers.utils.import_utils import is_datasets_available
@@ -45,7 +46,6 @@
         WhisperTokenizer,
         set_seed,
     )
-    from transformers.generation_logits_process import LogitsProcessorList, SuppressBlank, SuppressTokens
     from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
 
 
@@ -104,6 +104,8 @@ def __init__(
         num_mel_bins=80,
         decoder_start_token_id=85,
         num_conv_layers=1,
+        supress_tokens=None,
+        begin_supress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -127,6 +129,8 @@ def __init__(
         self.bos_token_id = bos_token_id
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
+        self.supress_tokens = supress_tokens
+        self.begin_supress_tokens = begin_supress_tokens
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -162,6 +166,8 @@ def get_config(self):
             decoder_ffn_dim=self.hidden_size,
             encoder_ffn_dim=self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id,
+            supress_tokens=self.supress_tokens,
+            begin_supress_tokens=self.begin_supress_tokens,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -884,7 +890,8 @@ def test_tiny_generation(self):
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
-
+        model.config.begin_supress_tokens = None
+        model.config.supress_tokens = None
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
 
@@ -916,22 +923,15 @@ def test_large_generation(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
 
-        logits_processor = LogitsProcessorList(
-            [
-                SuppressBlank(tokenizer.encode(" "), 50256),
-                SuppressTokens(model.config.non_speech_tokens),
-            ]
-        )
         decoder_input_ids = torch.tensor([[50258]]).long()
         generated_ids = model.generate(
             input_features,
             do_sample=False,
-            logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
+        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we're glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
     @slow
@@ -940,6 +940,7 @@ def test_large_generation_multilingual(self):
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
+        model.config.supress_tokens = NON_SPEECH_TOKENS_MULTI
 
         ds = load_dataset("common_voice", "ja", split="test", streaming=True)
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@@ -952,29 +953,22 @@ def test_large_generation_multilingual(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
 
-        logits_processor = LogitsProcessorList(
-            [
-                SuppressBlank(tokenizer.encode(" "), 50256),
-                SuppressTokens(model.config.non_speech_tokens),
-            ]
-        )
+        model.config.begin_supress_tokens = [tokenizer.encode(" ")[0], tokenizer.eos_token_id]
         decoder_input_ids = torch.tensor([[50258, 50359, 50266, 50363]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=True,
-            logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " 木村さんに電話を貸してもらいましょう。 I"
+        EXPECTED_TRANSCRIPT = "木村さんに電話を貸してもらいました。 木"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
         decoder_input_ids = torch.tensor([[50258, 50359, 50357]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,
             do_sample=False,
-            logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -986,10 +980,10 @@ def test_large_generation_multilingual(self):
         generated_ids = model.generate(
             input_features,
             do_sample=False,
-            logits_processor=logits_processor,
             decoder_input_ids=decoder_input_ids,
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = "I borrowed a phone from Kimura san"
+        EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san. Thank you for watching. Please subscribe"
+        # should only be "I borrowed a phone from Kimura san. But it seems like it is a well known bug"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From 3269d570b35a3e808e028bcfc5aee943cd124631 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 08:55:14 +0000
Subject: [PATCH 118/156] nit and style

---
 docs/source/en/_toctree.yml                            | 10 +++++-----
 .../models/whisper/configuration_whisper.py            |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index df2132568b973..e96c26aeb07ca 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -42,7 +42,8 @@
       title: Use tokenizers from 🤗 Tokenizers
     - local: multilingual
       title: Inference for multilingual models
-    - sections:
+    - isExpanded: false
+      sections:
       - local: tasks/sequence_classification
         title: Text classification
       - local: tasks/token_classification
@@ -58,7 +59,6 @@
       - local: tasks/multiple_choice
         title: Multiple choice
       title: Task guides
-      isExpanded: false
     title: Natural Language Processing
   - sections:
     - local: tasks/audio_classification
@@ -441,12 +441,12 @@
         title: Wav2Vec2Phoneme
       - local: model_doc/wavlm
         title: WavLM
+      - local: model_doc/whisper
+        title: Whisper
       - local: model_doc/xls_r
         title: XLS-R
       - local: model_doc/xlsr_wav2vec2
         title: XLSR-Wav2Vec2
-      - local: model_doc/whisper
-        title: Whisper
       title: Audio models
     - isExpanded: false
       sections:
@@ -509,4 +509,4 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
\ No newline at end of file
+  title: API
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index b472b86a5dc88..3271bc8756e9b 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -129,6 +129,7 @@ class WhisperConfig(PretrainedConfig):
             A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
             the token for " " (`blank_token_id`) and the `eos_token_id`
 
+
     Example:
 
     ```python

From 6b2ebd4590c34c8bef9597a54baa2e1b51a7446b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 11:24:34 +0000
Subject: [PATCH 119/156] update RAG to fit test and generate_logit

---
 src/transformers/models/rag/modeling_rag.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 45b606905362f..c3f870e123153 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -1633,6 +1633,8 @@ def extend_enc_output(tensor, num_beams=None):
             exponential_decay_length_penalty=exponential_decay_length_penalty,
             logits_processor=logits_processor,
             renormalize_logits=renormalize_logits,
+            supress_tokens = None, # Not used for RAG, were added after the whisper pull request
+            begin_supress_tokens = None  # Not used for RAG, were added after the whisper pull request
         )
 
         if num_beams == 1:

From dff15c219c2b1d2d1b786c2af108db9c05aae812 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 11:47:14 +0000
Subject: [PATCH 120/156] add copy pasted statment on english normalizer

---
 src/transformers/models/whisper/english_normalizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index d9570537cea0e..405e33c89409e 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -1,4 +1,5 @@
 # Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
+# Most of the code is copy pasted from the original whisper repository
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 7c51de1105ebc2146c3229a726bd3c3af25c2cb8 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 12:04:12 +0000
Subject: [PATCH 121/156] add arguments to config_common_kwargs

---
 src/transformers/models/rag/modeling_rag.py | 4 ++--
 tests/test_configuration_common.py          | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index c3f870e123153..62b2f80efbdc7 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -1633,8 +1633,8 @@ def extend_enc_output(tensor, num_beams=None):
             exponential_decay_length_penalty=exponential_decay_length_penalty,
             logits_processor=logits_processor,
             renormalize_logits=renormalize_logits,
-            supress_tokens = None, # Not used for RAG, were added after the whisper pull request
-            begin_supress_tokens = None  # Not used for RAG, were added after the whisper pull request
+            supress_tokens=None,  # Not used for RAG, were added after the whisper pull request
+            begin_supress_tokens=None,  # Not used for RAG, were added after the whisper pull request
         )
 
         if num_beams == 1:
diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index c2d48ef662541..51288377a3adc 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -84,6 +84,8 @@
     "sep_token_id": 9,
     "decoder_start_token_id": 10,
     "exponential_decay_length_penalty": (5, 1.01),
+    "supress_tokens": None,
+    "begin_supress_tokens": None,
     "task_specific_params": {"translation": "some_params"},
     "problem_type": "regression",
 }

From da99700e0d4ca8660c699002b87bde8b0669cfa9 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 30 Sep 2022 14:20:32 +0200
Subject: [PATCH 122/156] Update src/transformers/generation_utils.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/generation_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 61542659c7682..847178d1f5eb7 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1108,7 +1108,7 @@ def generate(
                 generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates
                 where penalty starts and `decay_factor` represents the factor of exponential decay
             supress_tokens  (`List[int]`, *optional*, defaults to `model.config.supress_tokens`):
-                A list of tokens that will be supressed at generation. Teh `SupressTokens` logit processor will set
+                A list of tokens that will be supressed at generation. The `SupressTokens` logit processor will set
                 their log probs to `-inf` so that they are not sampled.
 
             model_kwargs:

From af1beac52e18b0e834915632b20d0b74fb1f2572 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 30 Sep 2022 14:34:49 +0200
Subject: [PATCH 123/156] Update src/transformers/generation_logits_process.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/generation_logits_process.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index f43bf39f78362..649fcd898adfb 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -717,7 +717,7 @@ def __call__(self, input_ids, scores):
 
 
 class SuppressTokensLogitsProcessor(LogitsProcessor):
-    r""" """
+    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they are not sampled."""
 
     def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)

From 8277239c7432bea24f13e79d1e0b490a2c0e2791 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 30 Sep 2022 17:25:09 +0200
Subject: [PATCH 124/156] Update
 src/transformers/models/whisper/configuration_whisper.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/whisper/configuration_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 3271bc8756e9b..334badb6453e0 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -87,7 +87,7 @@ class WhisperConfig(PretrainedConfig):
         decoder_layerdrop (`float`, *optional*, defaults to 0.0):
             The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
             for more details.
-        decoder_start_token_id (`int`, *optional*, defaults to 50258):
+        decoder_start_token_id (`int`, *optional*, defaults to 50257):
             Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
             are provided to the `generate`function
         use_cache (`bool`, *optional*, defaults to True):

From 7b5e7939b2e9d504e524ec284f013a36ab4ff245 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Fri, 30 Sep 2022 17:28:49 +0200
Subject: [PATCH 125/156] Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 docs/source/en/model_doc/whisper.mdx                      | 2 +-
 src/transformers/models/whisper/configuration_whisper.py  | 8 ++++----
 .../models/whisper/feature_extraction_whisper.py          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source/en/model_doc/whisper.mdx b/docs/source/en/model_doc/whisper.mdx
index 8186ed22d5186..beb7bf3798bcb 100644
--- a/docs/source/en/model_doc/whisper.mdx
+++ b/docs/source/en/model_doc/whisper.mdx
@@ -24,7 +24,7 @@ The abstract from the paper is the following:
 Tips:
 
 - The model usually performs well without requiring any finetuning. 
-- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`generate`](https://huggingface.co/docs/transformers/v4.22.2/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate) function for inference.
+- The architecture follows a classic encoder-decoder architecture, which means that it relies on the [`~generation_utils.GenerationMixin.generate`] function for inference.
 - One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text.
 
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 334badb6453e0..2c4e9ded52903 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -120,14 +120,14 @@ class WhisperConfig(PretrainedConfig):
             Begin of stream token id.
         eos_token_id (`int`, *optional*, defaults to 50257):
             End of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to True):
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie input and output embeddings.
         supress_tokens (`List[int]`, *optional*, defaults to NON_SPEECH_TOKENS):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
         begin_supress_tokens (`List[int]`, *optional*, defaults to [220,50256]]):
             A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
-            the token for " " (`blank_token_id`) and the `eos_token_id`
+            the token for `" "` (`blank_token_id`) and the `eos_token_id`
 
 
     Example:
@@ -176,8 +176,8 @@ def __init__(
         bos_token_id=50257,
         eos_token_id=50256,
         tie_word_embeddings=True,
-        supress_tokens=NON_SPEECH_TOKENS,
-        begin_supress_tokens=[220, 50256],
+        supress_tokens=None,
+        begin_supress_tokens=None,
         **kwargs
     ):
         """_summary_
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 96bbf5fa3663c..2206e86573c18 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -49,7 +49,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
         chunk_length (`int`, defaults to 30):
             The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
             sequences.
-        n_fft (`int`, defaults to 30):
+        n_fft (`int`, defaults to 400):
             Size of the Fourier transform.
         padding_value (`float`, defaults to 0.0):
             Padding value used to pad the audio. Should correspond to silences.

From 325d0888d3a5de14b61f1c6b4cece160f800651e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 15:30:09 +0000
Subject: [PATCH 126/156] revert changes based on reviews

---
 src/transformers/configuration_utils.py     | 2 --
 src/transformers/models/rag/modeling_rag.py | 2 --
 src/transformers/tokenization_utils_base.py | 1 -
 tests/test_configuration_common.py          | 2 --
 4 files changed, 7 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 74f2f7e81e856..3fdc0f265f633 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -299,8 +299,6 @@ def __init__(self, **kwargs):
         self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
         self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
         self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
-        self.supress_tokens = kwargs.pop("supress_tokens", None)
-        self.begin_supress_tokens = kwargs.pop("begin_supress_tokens", None)
 
         # Fine-tuning task arguments
         self.architectures = kwargs.pop("architectures", None)
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 62b2f80efbdc7..45b606905362f 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -1633,8 +1633,6 @@ def extend_enc_output(tensor, num_beams=None):
             exponential_decay_length_penalty=exponential_decay_length_penalty,
             logits_processor=logits_processor,
             renormalize_logits=renormalize_logits,
-            supress_tokens=None,  # Not used for RAG, were added after the whisper pull request
-            begin_supress_tokens=None,  # Not used for RAG, were added after the whisper pull request
         )
 
         if num_beams == 1:
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 65eda3fb0ba32..54d562136db4a 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1822,7 +1822,6 @@ def _from_pretrained(
         if tokenizer_config_file is not None:
             with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                 init_kwargs = json.load(tokenizer_config_handle)
-
             # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
             config_tokenizer_class = init_kwargs.get("tokenizer_class")
             init_kwargs.pop("tokenizer_class", None)
diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index 51288377a3adc..c2d48ef662541 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -84,8 +84,6 @@
     "sep_token_id": 9,
     "decoder_start_token_id": 10,
     "exponential_decay_length_penalty": (5, 1.01),
-    "supress_tokens": None,
-    "begin_supress_tokens": None,
     "task_specific_params": {"translation": "some_params"},
     "problem_type": "regression",
 }

From 2f88dc88d1a53259b399bd4496256bc7b1ecca6e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 15:31:27 +0000
Subject: [PATCH 127/156] update doc and nits

---
 src/transformers/generation_logits_process.py |  5 ++
 src/transformers/generation_utils.py          | 28 ++++-----
 .../models/whisper/configuration_whisper.py   |  4 +-
 .../models/whisper/english_normalizer.py      | 60 ++++++++-----------
 .../models/whisper/modeling_whisper.py        |  7 ++-
 .../models/whisper/processing_whisper.py      |  6 +-
 .../models/whisper/tokenization_whisper.py    |  6 +-
 7 files changed, 53 insertions(+), 63 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 649fcd898adfb..0bcc953b929db 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -705,6 +705,11 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
 
 
 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
+    r"""
+    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function's starts generating 
+    using `begin_index` tokens. This usually happens when `use_cache` us set to `True`, and the `decoder_input_ids` only include
+    the previously generated token along with and `eos_token` if `forced_eos_token` is used.
+    """
     def __init__(self, begin_supress_tokens, begin_index):
         self.begin_supress_tokens = list(begin_supress_tokens)
         self.begin_index = begin_index
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 847178d1f5eb7..e4c87268e5899 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -693,8 +693,8 @@ def _get_logits_processor(
         exponential_decay_length_penalty: Tuple,
         logits_processor: Optional[LogitsProcessorList],
         renormalize_logits: Optional[bool],
-        supress_tokens: Optional[List[int]],
-        begin_supress_tokens: Optional[List[int]],
+        suppress_tokens: Optional[List[int]] = None,
+        begin_suppress_tokens: Optional[List[int]] = None,
     ) -> LogitsProcessorList:
         """
         This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
@@ -729,9 +729,9 @@ def _get_logits_processor(
             if exponential_decay_length_penalty is not None
             else self.config.exponential_decay_length_penalty
         )
-        supress_tokens = supress_tokens if supress_tokens is not None else self.config.supress_tokens
-        begin_supress_tokens = (
-            begin_supress_tokens if begin_supress_tokens is not None else self.config.begin_supress_tokens
+        suppress_tokens = suppress_tokens if suppress_tokens is not None else self.config.suppress_tokens
+        begin_suppress_tokens = (
+            begin_suppress_tokens if begin_suppress_tokens is not None else self.config.begin_suppress_tokens
         )
         # instantiate processors list
 
@@ -770,12 +770,12 @@ def _get_logits_processor(
             processors.append(
                 ExponentialDecayLengthPenalty(exponential_decay_length_penalty, eos_token_id, input_ids_seq_length)
             )
-        if supress_tokens is not None:
-            processors.append(SuppressTokensLogitsProcessor(supress_tokens))
-        if begin_supress_tokens is not None:
+        if suppress_tokens is not None:
+            processors.append(SuppressTokensLogitsProcessor(suppress_tokens))
+        if begin_suppress_tokens is not None:
             begin_index = input_ids_seq_length
             begin_index = begin_index if (input_ids_seq_length > 1 or forced_bos_token_id is None) else begin_index + 1
-            processors.append(SuppressTokensAtBeginLogitsProcessor(begin_supress_tokens, begin_index))
+            processors.append(SuppressTokensAtBeginLogitsProcessor(begin_suppress_tokens, begin_index))
 
         processors = self._merge_criteria_processor_list(processors, logits_processor)
         # `LogitNormalization` should always be the last logit processor, when present
@@ -947,8 +947,8 @@ def generate(
         remove_invalid_values: Optional[bool] = None,
         synced_gpus: Optional[bool] = False,
         exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
-        supress_tokens: Optional[List[int]] = None,
-        begin_supress_tokens: Optional[List[int]] = None,
+        suppress_tokens: Optional[List[int]] = None,
+        begin_suppress_tokens: Optional[List[int]] = None,
         **model_kwargs,
     ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
         r"""
@@ -1107,7 +1107,7 @@ def generate(
                 This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been
                 generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates
                 where penalty starts and `decay_factor` represents the factor of exponential decay
-            supress_tokens  (`List[int]`, *optional*, defaults to `model.config.supress_tokens`):
+            suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.suppress_tokens`):
                 A list of tokens that will be supressed at generation. The `SupressTokens` logit processor will set
                 their log probs to `-inf` so that they are not sampled.
 
@@ -1357,8 +1357,8 @@ def generate(
             exponential_decay_length_penalty=exponential_decay_length_penalty,
             logits_processor=logits_processor,
             renormalize_logits=renormalize_logits,
-            supress_tokens=supress_tokens,
-            begin_supress_tokens=begin_supress_tokens,
+            suppress_tokens=suppress_tokens,
+            begin_suppress_tokens=begin_suppress_tokens,
         )
 
         # 8. prepare stopping criteria
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 3271bc8756e9b..ec89aa6dd1d1e 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -122,10 +122,10 @@ class WhisperConfig(PretrainedConfig):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to True):
             Whether to tie input and output embeddings.
-        supress_tokens (`List[int]`, *optional*, defaults to NON_SPEECH_TOKENS):
+        supress_tokens (`List[int]`, *optional*, defaults to None):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
-        begin_supress_tokens (`List[int]`, *optional*, defaults to [220,50256]]):
+        begin_supress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
             A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
             the token for " " (`blank_token_id`) and the `eos_token_id`
 
diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index 405e33c89409e..f902f98f5e7bf 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -19,7 +19,7 @@
 from fractions import Fraction
 from typing import Iterator, List, Match, Optional, Union
 
-from transformers.utils.import_utils import is_more_itertools_available
+from ...utils import is_more_itertools_available
 
 
 if is_more_itertools_available():
@@ -56,18 +56,23 @@ def remove_symbols_and_diacritics(s: str, keep=""):
     Replace any other markers, symbols, and punctuations with a space, and drop any diacritics (category 'Mn' and some
     manual mappings)
     """
-    return "".join(
-        c
-        if c in keep
-        else ADDITIONAL_DIACRITICS[c]
-        if c in ADDITIONAL_DIACRITICS
-        else ""
-        if unicodedata.category(c) == "Mn"
-        else " "
-        if unicodedata.category(c)[0] in "MSP"
-        else c
-        for c in unicodedata.normalize("NFKD", s)
-    )
+
+    def replace_character(char):
+        if char in keep :
+            return char
+        elif char in ADDITIONAL_DIACRITICS:
+            return ADDITIONAL_DIACRITICS[char] 
+
+        elif unicodedata.category(char) == "Mn":
+            return ""
+
+        elif unicodedata.category(char)[0] in "MSP":
+            return " "
+        
+        return char
+
+
+    return "".join( replace_character(c) for c in unicodedata.normalize("NFKD", s))
 
 
 def remove_symbols(s: str):
@@ -111,33 +116,16 @@ def __init__(self):
         super().__init__()
 
         self.zeros = {"o", "oh", "zero"}
+        # fmt: off
         self.ones = {
             name: i
             for i, name in enumerate(
-                [
-                    "one",
-                    "two",
-                    "three",
-                    "four",
-                    "five",
-                    "six",
-                    "seven",
-                    "eight",
-                    "nine",
-                    "ten",
-                    "eleven",
-                    "twelve",
-                    "thirteen",
-                    "fourteen",
-                    "fifteen",
-                    "sixteen",
-                    "seventeen",
-                    "eighteen",
-                    "nineteen",
+                ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
                 ],
                 start=1,
             )
         }
+        # fmt: on
         self.ones_plural = {
             "sixes" if name == "six" else name + "s": (value, "s") for name, value in self.ones.items()
         }
@@ -267,7 +255,9 @@ def output(result: Union[str, int]):
             if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                 # arabic numbers (potentially with signs and fractions)
                 f = to_fraction(current_without_prefix)
-                assert f is not None
+                if f is not None: 
+                    raise ValueError("Converting the fraction failed")
+
                 if value is not None:
                     if isinstance(value, str) and value.endswith("."):
                         # concatenate decimals / ip address components
@@ -295,7 +285,6 @@ def output(result: Union[str, int]):
                     value = ones
                 elif isinstance(value, str) or prev in self.ones:
                     if prev in self.tens and ones < 10:  # replace the last zero with the digit
-                        assert value[-1] == "0"
                         value = value[:-1] + str(ones)
                     else:
                         value = str(value) + str(ones)
@@ -316,7 +305,6 @@ def output(result: Union[str, int]):
                     yield output(str(ones) + suffix)
                 elif isinstance(value, str) or prev in self.ones:
                     if prev in self.tens and ones < 10:
-                        assert value[-1] == "0"
                         yield output(value[:-1] + str(ones) + suffix)
                     else:
                         yield output(str(value) + str(ones) + suffix)
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 768e1d18f3349..c5f4733f5c780 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -20,7 +20,6 @@
 from typing import Optional, Tuple
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
@@ -134,9 +133,11 @@ def __init__(
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
 
+    # Copied from transformers.models.bart.modeling_bart.forward with BART->whisper
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
+    # Copied from transformers.models.bart.modeling_bart.forward with BART->whisper
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -635,8 +636,8 @@ def forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        inputs_embeds = F.gelu(self.conv1(input_features))
-        inputs_embeds = F.gelu(self.conv2(inputs_embeds))
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
 
         inputs_embeds = inputs_embeds.permute(0, 2, 1)
         embed_pos = self.embed_positions.weight
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 3f25a1a001c9f..914cc41554ba3 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -53,11 +53,7 @@ def __call__(self, *args, **kwargs):
         if self._in_target_context_manager:
             return self.current_processor(*args, **kwargs)
 
-        if "raw_speech" in kwargs:
-            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
-            audio = kwargs.pop("raw_speech")
-        else:
-            audio = kwargs.pop("audio", None)
+        audio = kwargs.pop("audio", None)
         text = kwargs.pop("text", None)
         if len(args) > 0:
             audio = args[0]
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 46ed902b23293..eae091538dff8 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -218,12 +218,12 @@ class WhisperTokenizer(PreTrainedTokenizer):
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The end of sequence token.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any

From 8df5a5872121ce5fcb09b7ad261a343b978ebde0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 15:44:16 +0000
Subject: [PATCH 128/156] more nits

---
 src/transformers/configuration_utils.py       |  2 ++
 src/transformers/generation_logits_process.py |  6 +++---
 .../models/whisper/configuration_whisper.py   | 12 +++++------
 tests/models/whisper/test_modeling_whisper.py | 20 +++++++++----------
 4 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 3fdc0f265f633..a77d93b8b1801 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -299,6 +299,8 @@ def __init__(self, **kwargs):
         self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
         self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)
         self.exponential_decay_length_penalty = kwargs.pop("exponential_decay_length_penalty", None)
+        self.suppress_tokens = kwargs.pop("suppress_tokens", None)
+        self.begin_suppress_tokens = kwargs.pop("begin_suppress_tokens", None)
 
         # Fine-tuning task arguments
         self.architectures = kwargs.pop("architectures", None)
diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 0bcc953b929db..ca55373a0c182 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -710,13 +710,13 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     using `begin_index` tokens. This usually happens when `use_cache` us set to `True`, and the `decoder_input_ids` only include
     the previously generated token along with and `eos_token` if `forced_eos_token` is used.
     """
-    def __init__(self, begin_supress_tokens, begin_index):
-        self.begin_supress_tokens = list(begin_supress_tokens)
+    def __init__(self, begin_suppress_tokens, begin_index):
+        self.begin_suppress_tokens = list(begin_suppress_tokens)
         self.begin_index = begin_index
 
     def __call__(self, input_ids, scores):
         if input_ids.shape[1] == self.begin_index:
-            scores[:, self.begin_supress_tokens] = -np.inf
+            scores[:, self.begin_suppress_tokens] = -np.inf
 
         return scores
 
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index b783a08c874cc..c943c6c502194 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -122,10 +122,10 @@ class WhisperConfig(PretrainedConfig):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie input and output embeddings.
-        supress_tokens (`List[int]`, *optional*, defaults to None):
+        suppress_tokens (`List[int]`, *optional*, defaults to None):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
-        begin_supress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
+        begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
             A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
             the token for `" "` (`blank_token_id`) and the `eos_token_id`
 
@@ -176,8 +176,8 @@ def __init__(
         bos_token_id=50257,
         eos_token_id=50256,
         tie_word_embeddings=True,
-        supress_tokens=None,
-        begin_supress_tokens=None,
+        suppress_tokens=None,
+        begin_suppress_tokens=[220,50256],
         **kwargs
     ):
         """_summary_
@@ -212,7 +212,7 @@ def __init__(
             is_encoder_decoder=is_encoder_decoder,
             decoder_start_token_id=decoder_start_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            supress_tokens=supress_tokens,
-            begin_supress_tokens=begin_supress_tokens,
+            suppress_tokens=suppress_tokens,
+            begin_suppress_tokens=begin_suppress_tokens,
             **kwargs,
         )
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 6940f051495a9..9d1bc3f9cbf8a 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -104,8 +104,8 @@ def __init__(
         num_mel_bins=80,
         decoder_start_token_id=85,
         num_conv_layers=1,
-        supress_tokens=None,
-        begin_supress_tokens=None,
+        suppress_tokens=None,
+        begin_suppress_tokens=None,
     ):
         self.parent = parent
         self.batch_size = batch_size
@@ -129,8 +129,8 @@ def __init__(
         self.bos_token_id = bos_token_id
         self.decoder_start_token_id = decoder_start_token_id
         self.num_conv_layers = num_conv_layers
-        self.supress_tokens = supress_tokens
-        self.begin_supress_tokens = begin_supress_tokens
+        self.suppress_tokens = suppress_tokens
+        self.begin_suppress_tokens = begin_suppress_tokens
 
     def prepare_config_and_inputs(self):
         input_features = floats_tensor([self.batch_size, self.num_mel_bins, self.seq_length], self.vocab_size)
@@ -166,8 +166,8 @@ def get_config(self):
             decoder_ffn_dim=self.hidden_size,
             encoder_ffn_dim=self.hidden_size,
             decoder_start_token_id=self.decoder_start_token_id,
-            supress_tokens=self.supress_tokens,
-            begin_supress_tokens=self.begin_supress_tokens,
+            suppress_tokens=self.suppress_tokens,
+            begin_suppress_tokens=self.begin_suppress_tokens,
         )
 
     def prepare_config_and_inputs_for_common(self):
@@ -890,8 +890,8 @@ def test_tiny_generation(self):
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
-        model.config.begin_supress_tokens = None
-        model.config.supress_tokens = None
+        model.config.begin_suppress_tokens = None
+        model.config.suppress_tokens = None
         input_speech = self._load_datasamples(1)
         feaure_extractor = WhisperFeatureExtractor()
 
@@ -940,7 +940,7 @@ def test_large_generation_multilingual(self):
         set_seed(0)
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
-        model.config.supress_tokens = NON_SPEECH_TOKENS_MULTI
+        model.config.suppress_tokens = NON_SPEECH_TOKENS_MULTI
 
         ds = load_dataset("common_voice", "ja", split="test", streaming=True)
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
@@ -953,7 +953,7 @@ def test_large_generation_multilingual(self):
 
         tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
 
-        model.config.begin_supress_tokens = [tokenizer.encode(" ")[0], tokenizer.eos_token_id]
+        model.config.begin_suppress_tokens = [tokenizer.encode(" ")[0], tokenizer.eos_token_id]
         decoder_input_ids = torch.tensor([[50258, 50359, 50266, 50363]]).long().to(torch_device)
         generated_ids = model.generate(
             input_features,

From cef34fdf644811e2bcad4a4ead490c6041b88292 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 30 Sep 2022 15:50:46 +0000
Subject: [PATCH 129/156] last nits

---
 src/transformers/generation_logits_process.py      | 11 +++++++----
 .../models/whisper/configuration_whisper.py        |  2 +-
 .../models/whisper/english_normalizer.py           | 14 ++++++--------
 .../models/whisper/modeling_whisper.py             |  4 ++--
 .../models/whisper/processing_whisper.py           |  1 -
 tests/models/whisper/test_modeling_whisper.py      |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index ca55373a0c182..21c180312cade 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -706,10 +706,12 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
 
 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     r"""
-    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function's starts generating 
-    using `begin_index` tokens. This usually happens when `use_cache` us set to `True`, and the `decoder_input_ids` only include
-    the previously generated token along with and `eos_token` if `forced_eos_token` is used.
+    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function's starts
+    generating using `begin_index` tokens. This usually happens when `use_cache` us set to `True`, and the
+    `decoder_input_ids` only include the previously generated token along with and `eos_token` if `forced_eos_token` is
+    used.
     """
+
     def __init__(self, begin_suppress_tokens, begin_index):
         self.begin_suppress_tokens = list(begin_suppress_tokens)
         self.begin_index = begin_index
@@ -722,7 +724,8 @@ def __call__(self, input_ids, scores):
 
 
 class SuppressTokensLogitsProcessor(LogitsProcessor):
-    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they are not sampled."""
+    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
+    are not sampled."""
 
     def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index c943c6c502194..438eec67c7257 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -177,7 +177,7 @@ def __init__(
         eos_token_id=50256,
         tie_word_embeddings=True,
         suppress_tokens=None,
-        begin_suppress_tokens=[220,50256],
+        begin_suppress_tokens=[220, 50256],
         **kwargs
     ):
         """_summary_
diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index f902f98f5e7bf..fcf73c402284c 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -58,21 +58,20 @@ def remove_symbols_and_diacritics(s: str, keep=""):
     """
 
     def replace_character(char):
-        if char in keep :
+        if char in keep:
             return char
         elif char in ADDITIONAL_DIACRITICS:
-            return ADDITIONAL_DIACRITICS[char] 
+            return ADDITIONAL_DIACRITICS[char]
 
         elif unicodedata.category(char) == "Mn":
             return ""
 
         elif unicodedata.category(char)[0] in "MSP":
             return " "
-        
-        return char
 
+        return char
 
-    return "".join( replace_character(c) for c in unicodedata.normalize("NFKD", s))
+    return "".join(replace_character(c) for c in unicodedata.normalize("NFKD", s))
 
 
 def remove_symbols(s: str):
@@ -120,8 +119,7 @@ def __init__(self):
         self.ones = {
             name: i
             for i, name in enumerate(
-                ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
-                ],
+                ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"],
                 start=1,
             )
         }
@@ -255,7 +253,7 @@ def output(result: Union[str, int]):
             if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                 # arabic numbers (potentially with signs and fractions)
                 f = to_fraction(current_without_prefix)
-                if f is not None: 
+                if f is not None:
                     raise ValueError("Converting the fraction failed")
 
                 if value is not None:
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index c5f4733f5c780..ed67217bfcb0d 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -133,11 +133,11 @@ def __init__(
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
 
-    # Copied from transformers.models.bart.modeling_bart.forward with BART->whisper
+    # Copied from transformers.models.bart.modeling_bart.BartAttention._shape with BART->whisper
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
-    # Copied from transformers.models.bart.modeling_bart.forward with BART->whisper
+    # Copied from transformers.models.bart.modeling_bart.BartAttention.forward with BART->whisper
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 914cc41554ba3..786f2dcf134fb 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -15,7 +15,6 @@
 """
 Speech processor class for Whisper
 """
-import warnings
 
 from ...processing_utils import ProcessorMixin
 
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 9d1bc3f9cbf8a..b15b589952479 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -931,7 +931,7 @@ def test_large_generation(self):
         )
         transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we're glad"
+        EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
     @slow

From 90c918091cce7998db5f78853eda994f83902f9b Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 1 Oct 2022 05:02:02 +0000
Subject: [PATCH 130/156] update test configuration common

---
 tests/test_configuration_common.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index c2d48ef662541..cf184c4d9d4de 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -84,6 +84,8 @@
     "sep_token_id": 9,
     "decoder_start_token_id": 10,
     "exponential_decay_length_penalty": (5, 1.01),
+    "suppress_tokens": None,
+    "begin_suppress_tokens": None,
     "task_specific_params": {"translation": "some_params"},
     "problem_type": "regression",
 }

From 72e86ed3c796f2a7bc46a1fcf4ce482bd7323bcb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 1 Oct 2022 05:05:40 +0000
Subject: [PATCH 131/156] add BART name in decoder attention mask documentation

---
 src/transformers/models/whisper/modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ed67217bfcb0d..f217dd08596e6 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -510,7 +510,7 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
 
             If you want to change padding behavior, you should read
             [`modeling_whisper._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the
-            paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+            BART paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
         head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
             Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
 

From 7d69c3cb2c15b69c8a19227ce0085e979e5c30b0 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Sat, 1 Oct 2022 07:06:49 +0200
Subject: [PATCH 132/156] Update
 src/transformers/models/whisper/modeling_whisper.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
---
 src/transformers/models/whisper/modeling_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ed67217bfcb0d..255c64457209a 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1079,7 +1079,7 @@ def forward(
 
 
 @add_start_docstrings(
-    "The Whisper Model with a language modeling head. Can be used for summarization.",
+    "The Whisper Model with a language modeling head. Can be used for automatic speech recognition.",
     WHISPER_START_DOCSTRING,
 )
 class WhisperForConditionalGeneration(WhisperPreTrainedModel):

From 84c25dc37e5de1cb3c7f606b5774db1a11ed73be Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 1 Oct 2022 05:08:14 +0000
Subject: [PATCH 133/156] style

---
 src/transformers/models/whisper/modeling_whisper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 04130ad1adbd7..0d5c9e9b5ab83 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -509,8 +509,8 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
             be used by default.
 
             If you want to change padding behavior, you should read
-            [`modeling_whisper._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the
-            BART paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
+            [`modeling_whisper._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in [the BART
+            paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
         head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
             Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
 

From bbf84b100569e8aeec637497b3588d52596071df Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 1 Oct 2022 05:44:13 +0000
Subject: [PATCH 134/156] nit

---
 tests/test_configuration_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index cf184c4d9d4de..a4bc494ab47be 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -84,8 +84,8 @@
     "sep_token_id": 9,
     "decoder_start_token_id": 10,
     "exponential_decay_length_penalty": (5, 1.01),
-    "suppress_tokens": None,
-    "begin_suppress_tokens": None,
+    "suppress_tokens": [0,1],
+    "begin_suppress_tokens": [2],
     "task_specific_params": {"translation": "some_params"},
     "problem_type": "regression",
 }

From f2ac0f5b58d525f0409575caf06d246a42e46d5e Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Sat, 1 Oct 2022 05:44:16 +0000
Subject: [PATCH 135/156] nit

---
 tests/test_configuration_common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py
index a4bc494ab47be..df05d2a4ac84f 100644
--- a/tests/test_configuration_common.py
+++ b/tests/test_configuration_common.py
@@ -84,8 +84,8 @@
     "sep_token_id": 9,
     "decoder_start_token_id": 10,
     "exponential_decay_length_penalty": (5, 1.01),
-    "suppress_tokens": [0,1],
-    "begin_suppress_tokens": [2],
+    "suppress_tokens": [0, 1],
+    "begin_suppress_tokens": 2,
     "task_specific_params": {"translation": "some_params"},
     "problem_type": "regression",
 }

From 93e9b2a80cb340b72a29c572fdb0a1860d2e0894 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 11:21:43 +0000
Subject: [PATCH 136/156] add english.json file to git

---
 src/transformers/models/whisper/english.json | 1742 ++++++++++++++++++
 1 file changed, 1742 insertions(+)
 create mode 100644 src/transformers/models/whisper/english.json

diff --git a/src/transformers/models/whisper/english.json b/src/transformers/models/whisper/english.json
new file mode 100644
index 0000000000000..bd84ae73faeb4
--- /dev/null
+++ b/src/transformers/models/whisper/english.json
@@ -0,0 +1,1742 @@
+{
+    "accessorise": "accessorize",
+    "accessorised": "accessorized",
+    "accessorises": "accessorizes",
+    "accessorising": "accessorizing",
+    "acclimatisation": "acclimatization",
+    "acclimatise": "acclimatize",
+    "acclimatised": "acclimatized",
+    "acclimatises": "acclimatizes",
+    "acclimatising": "acclimatizing",
+    "accoutrements": "accouterments",
+    "aeon": "eon",
+    "aeons": "eons",
+    "aerogramme": "aerogram",
+    "aerogrammes": "aerograms",
+    "aeroplane": "airplane",
+    "aeroplanes": "airplanes",
+    "aesthete": "esthete",
+    "aesthetes": "esthetes",
+    "aesthetic": "esthetic",
+    "aesthetically": "esthetically",
+    "aesthetics": "esthetics",
+    "aetiology": "etiology",
+    "ageing": "aging",
+    "aggrandisement": "aggrandizement",
+    "agonise": "agonize",
+    "agonised": "agonized",
+    "agonises": "agonizes",
+    "agonising": "agonizing",
+    "agonisingly": "agonizingly",
+    "almanack": "almanac",
+    "almanacks": "almanacs",
+    "aluminium": "aluminum",
+    "amortisable": "amortizable",
+    "amortisation": "amortization",
+    "amortisations": "amortizations",
+    "amortise": "amortize",
+    "amortised": "amortized",
+    "amortises": "amortizes",
+    "amortising": "amortizing",
+    "amphitheatre": "amphitheater",
+    "amphitheatres": "amphitheaters",
+    "anaemia": "anemia",
+    "anaemic": "anemic",
+    "anaesthesia": "anesthesia",
+    "anaesthetic": "anesthetic",
+    "anaesthetics": "anesthetics",
+    "anaesthetise": "anesthetize",
+    "anaesthetised": "anesthetized",
+    "anaesthetises": "anesthetizes",
+    "anaesthetising": "anesthetizing",
+    "anaesthetist": "anesthetist",
+    "anaesthetists": "anesthetists",
+    "anaesthetize": "anesthetize",
+    "anaesthetized": "anesthetized",
+    "anaesthetizes": "anesthetizes",
+    "anaesthetizing": "anesthetizing",
+    "analogue": "analog",
+    "analogues": "analogs",
+    "analyse": "analyze",
+    "analysed": "analyzed",
+    "analyses": "analyzes",
+    "analysing": "analyzing",
+    "anglicise": "anglicize",
+    "anglicised": "anglicized",
+    "anglicises": "anglicizes",
+    "anglicising": "anglicizing",
+    "annualised": "annualized",
+    "antagonise": "antagonize",
+    "antagonised": "antagonized",
+    "antagonises": "antagonizes",
+    "antagonising": "antagonizing",
+    "apologise": "apologize",
+    "apologised": "apologized",
+    "apologises": "apologizes",
+    "apologising": "apologizing",
+    "appal": "appall",
+    "appals": "appalls",
+    "appetiser": "appetizer",
+    "appetisers": "appetizers",
+    "appetising": "appetizing",
+    "appetisingly": "appetizingly",
+    "arbour": "arbor",
+    "arbours": "arbors",
+    "archeological": "archaeological",
+    "archaeologically": "archeologically",
+    "archaeologist": "archeologist",
+    "archaeologists": "archeologists",
+    "archaeology": "archeology</span>",
+    "ardour": "ardor",
+    "armour": "armor",
+    "armoured": "armored",
+    "armourer": "armorer",
+    "armourers": "armorers",
+    "armouries": "armories",
+    "armoury": "armory",
+    "artefact": "artifact",
+    "artefacts": "artifacts",
+    "authorise": "authorize",
+    "authorised": "authorized",
+    "authorises": "authorizes",
+    "authorising": "authorizing",
+    "axe": "ax",
+    "backpedalled": "backpedaled",
+    "backpedalling": "backpedaling",
+    "bannister": "banister",
+    "bannisters": "banisters",
+    "baptise": "baptize",
+    "baptised": "baptized",
+    "baptises": "baptizes",
+    "baptising": "baptizing",
+    "bastardise": "bastardize",
+    "bastardised": "bastardized",
+    "bastardises": "bastardizes",
+    "bastardising": "bastardizing",
+    "battleax": "battleaxe",
+    "baulk": "balk",
+    "baulked": "balked",
+    "baulking": "balking",
+    "baulks": "balks",
+    "bedevilled": "bedeviled",
+    "bedevilling": "bedeviling",
+    "behaviour": "behavior",
+    "behavioural": "behavioral",
+    "behaviourism": "behaviorism",
+    "behaviourist": "behaviorist",
+    "behaviourists": "behaviorists",
+    "behaviours": "behaviors",
+    "behove": "behoove",
+    "behoved": "behooved",
+    "behoves": "behooves",
+    "bejewelled": "bejeweled",
+    "belabour": "belabor",
+    "belaboured": "belabored",
+    "belabouring": "belaboring",
+    "belabours": "belabors",
+    "bevelled": "beveled",
+    "bevvies": "bevies",
+    "bevvy": "bevy",
+    "biassed": "biased",
+    "biassing": "biasing",
+    "bingeing": "binging",
+    "bougainvillaea": "bougainvillea",
+    "bougainvillaeas": "bougainvilleas",
+    "bowdlerise": "bowdlerize",
+    "bowdlerised": "bowdlerized",
+    "bowdlerises": "bowdlerizes",
+    "bowdlerising": "bowdlerizing",
+    "breathalyse": "breathalyze",
+    "breathalysed": "breathalyzed",
+    "breathalyser": "breathalyzer",
+    "breathalysers": "breathalyzers",
+    "breathalyses": "breathalyzes",
+    "breathalysing": "breathalyzing",
+    "brutalise": "brutalize",
+    "brutalised": "brutalized",
+    "brutalises": "brutalizes",
+    "brutalising": "brutalizing",
+    "busses": "buses",
+    "bussing": "busing",
+    "caesarean": "cesarean",
+    "caesareans": "cesareans",
+    "calibre": "caliber",
+    "calibres": "calibers",
+    "calliper": "caliper",
+    "callipers": "calipers",
+    "callisthenics": "calisthenics",
+    "canalise": "canalize",
+    "canalised": "canalized",
+    "canalises": "canalizes",
+    "canalising": "canalizing",
+    "cancelation": "cancellation",
+    "cancelations": "cancellations",
+    "cancelled": "canceled",
+    "cancelling": "canceling",
+    "candour": "candor",
+    "cannibalise": "cannibalize",
+    "cannibalised": "cannibalized",
+    "cannibalises": "cannibalizes",
+    "cannibalising": "cannibalizing",
+    "canonise": "canonize",
+    "canonised": "canonized",
+    "canonises": "canonizes",
+    "canonising": "canonizing",
+    "capitalise": "capitalize",
+    "capitalised": "capitalized",
+    "capitalises": "capitalizes",
+    "capitalising": "capitalizing",
+    "caramelise": "caramelize",
+    "caramelised": "caramelized",
+    "caramelises": "caramelizes",
+    "caramelising": "caramelizing",
+    "carbonise": "carbonize",
+    "carbonised": "carbonized",
+    "carbonises": "carbonizes",
+    "carbonising": "carbonizing",
+    "carolled": "caroled",
+    "carolling": "caroling",
+    "catalogue": "catalog",
+    "catalogued": "cataloged",
+    "catalogues": "catalogs",
+    "cataloguing": "cataloging",
+    "catalyse": "catalyze",
+    "catalysed": "catalyzed",
+    "catalyses": "catalyzes",
+    "catalysing": "catalyzing",
+    "categorise": "categorize",
+    "categorised": "categorized",
+    "categorises": "categorizes",
+    "categorising": "categorizing",
+    "cauterise": "cauterize",
+    "cauterised": "cauterized",
+    "cauterises": "cauterizes",
+    "cauterising": "cauterizing",
+    "cavilled": "caviled",
+    "cavilling": "caviling",
+    "centigramme": "centigram",
+    "centigrammes": "centigrams",
+    "centilitre": "centiliter",
+    "centilitres": "centiliters",
+    "centimetre": "centimeter",
+    "centimetres": "centimeters",
+    "centralise": "centralize",
+    "centralised": "centralized",
+    "centralises": "centralizes",
+    "centralising": "centralizing",
+    "centre": "center",
+    "centred": "centered",
+    "centrefold": "centerfold",
+    "centrefolds": "centerfolds",
+    "centrepiece": "centerpiece",
+    "centrepieces": "centerpieces",
+    "centres": "centers",
+    "channelled": "channeled",
+    "channelling": "channeling",
+    "characterise": "characterize",
+    "characterised": "characterized",
+    "characterises": "characterizes",
+    "characterising": "characterizing",
+    "cheque": "check",
+    "chequebook": "checkbook",
+    "chequebooks": "checkbooks",
+    "chequered": "checkered",
+    "cheques": "checks",
+    "chilli": "chili",
+    "chimaera": "chimera",
+    "chimaeras": "chimeras",
+    "chiselled": "chiseled",
+    "chiselling": "chiseling",
+    "circularise": "circularize",
+    "circularised": "circularized",
+    "circularises": "circularizes",
+    "circularising": "circularizing",
+    "civilise": "civilize",
+    "civilised": "civilized",
+    "civilises": "civilizes",
+    "civilising": "civilizing",
+    "clamour": "clamor",
+    "clamoured": "clamored",
+    "clamouring": "clamoring",
+    "clamours": "clamors",
+    "clangour": "clangor",
+    "clarinettist": "clarinetist",
+    "clarinettists": "clarinetists",
+    "collectivise": "collectivize",
+    "collectivised": "collectivized",
+    "collectivises": "collectivizes",
+    "collectivising": "collectivizing",
+    "colonisation": "colonization",
+    "colonise": "colonize",
+    "colonised": "colonized",
+    "coloniser": "colonizer",
+    "colonisers": "colonizers",
+    "colonises": "colonizes",
+    "colonising": "colonizing",
+    "colour": "color",
+    "colourant": "colorant",
+    "colourants": "colorants",
+    "coloured": "colored",
+    "coloureds": "coloreds",
+    "colourful": "colorful",
+    "colourfully": "colorfully",
+    "colouring": "coloring",
+    "colourize": "colorize",
+    "colourized": "colorized",
+    "colourizes": "colorizes",
+    "colourizing": "colorizing",
+    "colourless": "colorless",
+    "colours": "colors",
+    "commercialise": "commercialize",
+    "commercialised": "commercialized",
+    "commercialises": "commercializes",
+    "commercialising": "commercializing",
+    "compartmentalise": "compartmentalize",
+    "compartmentalised": "compartmentalized",
+    "compartmentalises": "compartmentalizes",
+    "compartmentalising": "compartmentalizing",
+    "computerise": "computerize",
+    "computerised": "computerized",
+    "computerises": "computerizes",
+    "computerising": "computerizing",
+    "conceptualise": "conceptualize",
+    "conceptualised": "conceptualized",
+    "conceptualises": "conceptualizes",
+    "conceptualising": "conceptualizing",
+    "connexion": "connection",
+    "connexions": "connections",
+    "contextualise": "contextualize",
+    "contextualised": "contextualized",
+    "contextualises": "contextualizes",
+    "contextualising": "contextualizing",
+    "cosier": "cozier",
+    "cosies": "cozies",
+    "cosiest": "coziest",
+    "cosily": "cozily",
+    "cosiness": "coziness",
+    "cosy": "cozy",
+    "councillor": "councilor",
+    "councillors": "councilors",
+    "counselled": "counseled",
+    "counselling": "counseling",
+    "counsellor": "counselor",
+    "counsellors": "counselors",
+    "crenelated": "crenellated",
+    "criminalise": "criminalize",
+    "criminalised": "criminalized",
+    "criminalises": "criminalizes",
+    "criminalising": "criminalizing",
+    "criticise": "criticize",
+    "criticised": "criticized",
+    "criticises": "criticizes",
+    "criticising": "criticizing",
+    "crueller": "crueler",
+    "cruellest": "cruelest",
+    "crystallisation": "crystallization",
+    "crystallise": "crystallize",
+    "crystallised": "crystallized",
+    "crystallises": "crystallizes",
+    "crystallising": "crystallizing",
+    "cudgelled": "cudgeled",
+    "cudgelling": "cudgeling",
+    "customise": "customize",
+    "customised": "customized",
+    "customises": "customizes",
+    "customising": "customizing",
+    "cypher": "cipher",
+    "cyphers": "ciphers",
+    "decentralisation": "decentralization",
+    "decentralise": "decentralize",
+    "decentralised": "decentralized",
+    "decentralises": "decentralizes",
+    "decentralising": "decentralizing",
+    "decriminalisation": "decriminalization",
+    "decriminalise": "decriminalize",
+    "decriminalised": "decriminalized",
+    "decriminalises": "decriminalizes",
+    "decriminalising": "decriminalizing",
+    "defence": "defense",
+    "defenceless": "defenseless",
+    "defences": "defenses",
+    "dehumanisation": "dehumanization",
+    "dehumanise": "dehumanize",
+    "dehumanised": "dehumanized",
+    "dehumanises": "dehumanizes",
+    "dehumanising": "dehumanizing",
+    "demeanour": "demeanor",
+    "demilitarisation": "demilitarization",
+    "demilitarise": "demilitarize",
+    "demilitarised": "demilitarized",
+    "demilitarises": "demilitarizes",
+    "demilitarising": "demilitarizing",
+    "demobilisation": "demobilization",
+    "demobilise": "demobilize",
+    "demobilised": "demobilized",
+    "demobilises": "demobilizes",
+    "demobilising": "demobilizing",
+    "democratisation": "democratization",
+    "democratise": "democratize",
+    "democratised": "democratized",
+    "democratises": "democratizes",
+    "democratising": "democratizing",
+    "demonise": "demonize",
+    "demonised": "demonized",
+    "demonises": "demonizes",
+    "demonising": "demonizing",
+    "demoralisation": "demoralization",
+    "demoralise": "demoralize",
+    "demoralised": "demoralized",
+    "demoralises": "demoralizes",
+    "demoralising": "demoralizing",
+    "denationalisation": "denationalization",
+    "denationalise": "denationalize",
+    "denationalised": "denationalized",
+    "denationalises": "denationalizes",
+    "denationalising": "denationalizing",
+    "deodorise": "deodorize",
+    "deodorised": "deodorized",
+    "deodorises": "deodorizes",
+    "deodorising": "deodorizing",
+    "depersonalise": "depersonalize",
+    "depersonalised": "depersonalized",
+    "depersonalises": "depersonalizes",
+    "depersonalising": "depersonalizing",
+    "deputise": "deputize",
+    "deputised": "deputized",
+    "deputises": "deputizes",
+    "deputising": "deputizing",
+    "desensitisation": "desensitization",
+    "desensitise": "desensitize",
+    "desensitised": "desensitized",
+    "desensitises": "desensitizes",
+    "desensitising": "desensitizing",
+    "destabilisation": "destabilization",
+    "destabilise": "destabilize",
+    "destabilised": "destabilized",
+    "destabilises": "destabilizes",
+    "destabilising": "destabilizing",
+    "dialled": "dialed",
+    "dialling": "dialing",
+    "dialogue": "dialog",
+    "dialogues": "dialogs",
+    "diarrhoea": "diarrhea",
+    "digitise": "digitize",
+    "digitised": "digitized",
+    "digitises": "digitizes",
+    "digitising": "digitizing",
+    "disc": "disk",
+    "discolour": "discolor",
+    "discoloured": "discolored",
+    "discolouring": "discoloring",
+    "discolours": "discolors",
+    "discs": "disks",
+    "disembowelled": "disemboweled",
+    "disembowelling": "disemboweling",
+    "disfavour": "disfavor",
+    "dishevelled": "disheveled",
+    "dishonour": "dishonor",
+    "dishonourable": "dishonorable",
+    "dishonourably": "dishonorably",
+    "dishonoured": "dishonored",
+    "dishonouring": "dishonoring",
+    "dishonours": "dishonors",
+    "disorganisation": "disorganization",
+    "disorganised": "disorganized",
+    "distil": "distill",
+    "distils": "distills",
+    "dramatisation": "dramatization",
+    "dramatisations": "dramatizations",
+    "dramatise": "dramatize",
+    "dramatised": "dramatized",
+    "dramatises": "dramatizes",
+    "dramatising": "dramatizing",
+    "draught": "draft",
+    "draughtboard": "draftboard",
+    "draughtboards": "draftboards",
+    "draughtier": "draftier",
+    "draughtiest": "draftiest",
+    "draughts": "drafts",
+    "draughtsman": "draftsman",
+    "draughtsmanship": "draftsmanship",
+    "draughtsmen": "draftsmen",
+    "draughtswoman": "draftswoman",
+    "draughtswomen": "draftswomen",
+    "draughty": "drafty",
+    "drivelled": "driveled",
+    "drivelling": "driveling",
+    "duelled": "dueled",
+    "duelling": "dueling",
+    "economise": "economize",
+    "economised": "economized",
+    "economises": "economizes",
+    "economising": "economizing",
+    "edoema": "edema",
+    "editorialise": "editorialize",
+    "editorialised": "editorialized",
+    "editorialises": "editorializes",
+    "editorialising": "editorializing",
+    "empathise": "empathize",
+    "empathised": "empathized",
+    "empathises": "empathizes",
+    "empathising": "empathizing",
+    "emphasise": "emphasize",
+    "emphasised": "emphasized",
+    "emphasises": "emphasizes",
+    "emphasising": "emphasizing",
+    "enamelled": "enameled",
+    "enamelling": "enameling",
+    "enamoured": "enamored",
+    "encyclopaedia": "encyclopedia",
+    "encyclopaedias": "encyclopedias",
+    "encyclopaedic": "encyclopedic",
+    "endeavour": "endeavor",
+    "endeavoured": "endeavored",
+    "endeavouring": "endeavoring",
+    "endeavours": "endeavors",
+    "energise": "energize",
+    "energised": "energized",
+    "energises": "energizes",
+    "energising": "energizing",
+    "enrol": "enroll",
+    "enrols": "enrolls",
+    "enthral": "enthrall",
+    "enthrals": "enthralls",
+    "epaulette": "epaulet",
+    "epaulettes": "epaulets",
+    "epicentre": "epicenter",
+    "epicentres": "epicenters",
+    "epilogue": "epilog",
+    "epilogues": "epilogs",
+    "epitomise": "epitomize",
+    "epitomised": "epitomized",
+    "epitomises": "epitomizes",
+    "epitomising": "epitomizing",
+    "equalisation": "equalization",
+    "equalise": "equalize",
+    "equalised": "equalized",
+    "equaliser": "equalizer",
+    "equalisers": "equalizers",
+    "equalises": "equalizes",
+    "equalising": "equalizing",
+    "eulogise": "eulogize",
+    "eulogised": "eulogized",
+    "eulogises": "eulogizes",
+    "eulogising": "eulogizing",
+    "evangelise": "evangelize",
+    "evangelised": "evangelized",
+    "evangelises": "evangelizes",
+    "evangelising": "evangelizing",
+    "exorcise": "exorcize",
+    "exorcised": "exorcized",
+    "exorcises": "exorcizes",
+    "exorcising": "exorcizing",
+    "extemporisation": "extemporization",
+    "extemporise": "extemporize",
+    "extemporised": "extemporized",
+    "extemporises": "extemporizes",
+    "extemporising": "extemporizing",
+    "externalisation": "externalization",
+    "externalisations": "externalizations",
+    "externalise": "externalize",
+    "externalised": "externalized",
+    "externalises": "externalizes",
+    "externalising": "externalizing",
+    "factorise": "factorize",
+    "factorised": "factorized",
+    "factorises": "factorizes",
+    "factorising": "factorizing",
+    "faecal": "fecal",
+    "faeces": "feces",
+    "familiarisation": "familiarization",
+    "familiarise": "familiarize",
+    "familiarised": "familiarized",
+    "familiarises": "familiarizes",
+    "familiarising": "familiarizing",
+    "fantasise": "fantasize",
+    "fantasised": "fantasized",
+    "fantasises": "fantasizes",
+    "fantasising": "fantasizing",
+    "favour": "favor",
+    "favourable": "favorable",
+    "favourably": "favorably",
+    "favoured": "favored",
+    "favouring": "favoring",
+    "favourite": "favorite",
+    "favourites": "favorites",
+    "favouritism": "favoritism",
+    "favours": "favors",
+    "feminise": "feminize",
+    "feminised": "feminized",
+    "feminises": "feminizes",
+    "feminising": "feminizing",
+    "fertilisation": "fertilization",
+    "fertilise": "fertilize",
+    "fertilised": "fertilized",
+    "fertiliser": "fertilizer",
+    "fertilisers": "fertilizers",
+    "fertilises": "fertilizes",
+    "fertilising": "fertilizing",
+    "fervour": "fervor",
+    "fibre": "fiber",
+    "fibreglass": "fiberglass",
+    "fibres": "fibers",
+    "fictionalisation": "fictionalization",
+    "fictionalisations": "fictionalizations",
+    "fictionalise": "fictionalize",
+    "fictionalised": "fictionalized",
+    "fictionalises": "fictionalizes",
+    "fictionalising": "fictionalizing",
+    "fillet": "filet",
+    "filleted": "fileted",
+    "filleting": "fileting",
+    "fillets": "filets",
+    "finalisation": "finalization",
+    "finalise": "finalize",
+    "finalised": "finalized",
+    "finalises": "finalizes",
+    "finalising": "finalizing",
+    "flautist": "flutist",
+    "flautists": "flutists",
+    "flavour": "flavor",
+    "flavoured": "flavored",
+    "flavouring": "flavoring",
+    "flavourings": "flavorings",
+    "flavourless": "flavorless",
+    "flavours": "flavors",
+    "flavoursome": "flavorsome",
+    "flyer / flier": "flier / flyer",
+    "foetal": "fetal",
+    "foetid": "fetid",
+    "foetus": "fetus",
+    "foetuses": "fetuses",
+    "formalisation": "formalization",
+    "formalise": "formalize",
+    "formalised": "formalized",
+    "formalises": "formalizes",
+    "formalising": "formalizing",
+    "fossilisation": "fossilization",
+    "fossilise": "fossilize",
+    "fossilised": "fossilized",
+    "fossilises": "fossilizes",
+    "fossilising": "fossilizing",
+    "fraternisation": "fraternization",
+    "fraternise": "fraternize",
+    "fraternised": "fraternized",
+    "fraternises": "fraternizes",
+    "fraternising": "fraternizing",
+    "fulfil": "fulfill",
+    "fulfilment": "fulfillment",
+    "fulfils": "fulfills",
+    "funnelled": "funneled",
+    "funnelling": "funneling",
+    "galvanise": "galvanize",
+    "galvanised": "galvanized",
+    "galvanises": "galvanizes",
+    "galvanising": "galvanizing",
+    "gambolled": "gamboled",
+    "gambolling": "gamboling",
+    "gaol": "jail",
+    "gaolbird": "jailbird",
+    "gaolbirds": "jailbirds",
+    "gaolbreak": "jailbreak",
+    "gaolbreaks": "jailbreaks",
+    "gaoled": "jailed",
+    "gaoler": "jailer",
+    "gaolers": "jailers",
+    "gaoling": "jailing",
+    "gaols": "jails",
+    "gasses": "gases",
+    "gage": "gauge",
+    "gaged": "gauged",
+    "gages": "gauges",
+    "gaging": "gauging",
+    "generalisation": "generalization",
+    "generalisations": "generalizations",
+    "generalise": "generalize",
+    "generalised": "generalized",
+    "generalises": "generalizes",
+    "generalising": "generalizing",
+    "ghettoise": "ghettoize",
+    "ghettoised": "ghettoized",
+    "ghettoises": "ghettoizes",
+    "ghettoising": "ghettoizing",
+    "gipsies": "gypsies",
+    "glamorise": "glamorize",
+    "glamorised": "glamorized",
+    "glamorises": "glamorizes",
+    "glamorising": "glamorizing",
+    "glamor": "glamour",
+    "globalisation": "globalization",
+    "globalise": "globalize",
+    "globalised": "globalized",
+    "globalises": "globalizes",
+    "globalising": "globalizing",
+    "glueing": "gluing",
+    "goitre": "goiter",
+    "goitres": "goiters",
+    "gonorrhoea": "gonorrhea",
+    "gramme": "gram",
+    "grammes": "grams",
+    "gravelled": "graveled",
+    "grey": "gray",
+    "greyed": "grayed",
+    "greying": "graying",
+    "greyish": "grayish",
+    "greyness": "grayness",
+    "greys": "grays",
+    "grovelled": "groveled",
+    "grovelling": "groveling",
+    "groyne": "groin",
+    "groynes": "groins",
+    "gruelling": "grueling",
+    "gruellingly": "gruelingly",
+    "gryphon": "griffin",
+    "gryphons": "griffins",
+    "gynaecological": "gynecological",
+    "gynaecologist": "gynecologist",
+    "gynaecologists": "gynecologists",
+    "gynaecology": "gynecology",
+    "haematological": "hematological",
+    "haematologist": "hematologist",
+    "haematologists": "hematologists",
+    "haematology": "hematology",
+    "haemoglobin": "hemoglobin",
+    "haemophilia": "hemophilia",
+    "haemophiliac": "hemophiliac",
+    "haemophiliacs": "hemophiliacs",
+    "haemorrhage": "hemorrhage",
+    "haemorrhaged": "hemorrhaged",
+    "haemorrhages": "hemorrhages",
+    "haemorrhaging": "hemorrhaging",
+    "haemorrhoids": "hemorrhoids",
+    "harbour": "harbor",
+    "harboured": "harbored",
+    "harbouring": "harboring",
+    "harbours": "harbors",
+    "harmonisation": "harmonization",
+    "harmonise": "harmonize",
+    "harmonised": "harmonized",
+    "harmonises": "harmonizes",
+    "harmonising": "harmonizing",
+    "homoeopath": "homeopath",
+    "homoeopathic": "homeopathic",
+    "homoeopaths": "homeopaths",
+    "homoeopathy": "homeopathy",
+    "homogenise": "homogenize",
+    "homogenised": "homogenized",
+    "homogenises": "homogenizes",
+    "homogenising": "homogenizing",
+    "honour": "honor",
+    "honourable": "honorable",
+    "honourably": "honorably",
+    "honoured": "honored",
+    "honouring": "honoring",
+    "honours": "honors",
+    "hospitalisation": "hospitalization",
+    "hospitalise": "hospitalize",
+    "hospitalised": "hospitalized",
+    "hospitalises": "hospitalizes",
+    "hospitalising": "hospitalizing",
+    "humanise": "humanize",
+    "humanised": "humanized",
+    "humanises": "humanizes",
+    "humanising": "humanizing",
+    "humour": "humor",
+    "humoured": "humored",
+    "humouring": "humoring",
+    "humourless": "humorless",
+    "humours": "humors",
+    "hybridise": "hybridize",
+    "hybridised": "hybridized",
+    "hybridises": "hybridizes",
+    "hybridising": "hybridizing",
+    "hypnotise": "hypnotize",
+    "hypnotised": "hypnotized",
+    "hypnotises": "hypnotizes",
+    "hypnotising": "hypnotizing",
+    "hypothesise": "hypothesize",
+    "hypothesised": "hypothesized",
+    "hypothesises": "hypothesizes",
+    "hypothesising": "hypothesizing",
+    "idealisation": "idealization",
+    "idealise": "idealize",
+    "idealised": "idealized",
+    "idealises": "idealizes",
+    "idealising": "idealizing",
+    "idolise": "idolize",
+    "idolised": "idolized",
+    "idolises": "idolizes",
+    "idolising": "idolizing",
+    "immobilisation": "immobilization",
+    "immobilise": "immobilize",
+    "immobilised": "immobilized",
+    "immobiliser": "immobilizer",
+    "immobilisers": "immobilizers",
+    "immobilises": "immobilizes",
+    "immobilising": "immobilizing",
+    "immortalise": "immortalize",
+    "immortalised": "immortalized",
+    "immortalises": "immortalizes",
+    "immortalising": "immortalizing",
+    "immunisation": "immunization",
+    "immunise": "immunize",
+    "immunised": "immunized",
+    "immunises": "immunizes",
+    "immunising": "immunizing",
+    "impanelled": "impaneled",
+    "impanelling": "impaneling",
+    "imperilled": "imperiled",
+    "imperilling": "imperiling",
+    "individualise": "individualize",
+    "individualised": "individualized",
+    "individualises": "individualizes",
+    "individualising": "individualizing",
+    "industrialise": "industrialize",
+    "industrialised": "industrialized",
+    "industrialises": "industrializes",
+    "industrialising": "industrializing",
+    "inflexion": "inflection",
+    "inflexions": "inflections",
+    "initialise": "initialize",
+    "initialised": "initialized",
+    "initialises": "initializes",
+    "initialising": "initializing",
+    "initialled": "initialed",
+    "initialling": "initialing",
+    "instal": "install",
+    "instalment": "installment",
+    "instalments": "installments",
+    "instals": "installs",
+    "instil": "instill",
+    "instils": "instills",
+    "institutionalisation": "institutionalization",
+    "institutionalise": "institutionalize",
+    "institutionalised": "institutionalized",
+    "institutionalises": "institutionalizes",
+    "institutionalising": "institutionalizing",
+    "intellectualise": "intellectualize",
+    "intellectualised": "intellectualized",
+    "intellectualises": "intellectualizes",
+    "intellectualising": "intellectualizing",
+    "internalisation": "internalization",
+    "internalise": "internalize",
+    "internalised": "internalized",
+    "internalises": "internalizes",
+    "internalising": "internalizing",
+    "internationalisation": "internationalization",
+    "internationalise": "internationalize",
+    "internationalised": "internationalized",
+    "internationalises": "internationalizes",
+    "internationalising": "internationalizing",
+    "ionisation": "ionization",
+    "ionise": "ionize",
+    "ionised": "ionized",
+    "ioniser": "ionizer",
+    "ionisers": "ionizers",
+    "ionises": "ionizes",
+    "ionising": "ionizing",
+    "italicise": "italicize",
+    "italicised": "italicized",
+    "italicises": "italicizes",
+    "italicising": "italicizing",
+    "itemise": "itemize",
+    "itemised": "itemized",
+    "itemises": "itemizes",
+    "itemising": "itemizing",
+    "jeopardise": "jeopardize",
+    "jeopardised": "jeopardized",
+    "jeopardises": "jeopardizes",
+    "jeopardising": "jeopardizing",
+    "jewelled": "jeweled",
+    "jeweller": "jeweler",
+    "jewellers": "jewelers",
+    "jewellery": "jewelry",
+    "judgement": "judgment",
+    "kilogramme": "kilogram",
+    "kilogrammes": "kilograms",
+    "kilometre": "kilometer",
+    "kilometres": "kilometers",
+    "labelled": "labeled",
+    "labelling": "labeling",
+    "labour": "labor",
+    "laboured": "labored",
+    "labourer": "laborer",
+    "labourers": "laborers",
+    "labouring": "laboring",
+    "labours": "labors",
+    "lacklustre": "lackluster",
+    "legalisation": "legalization",
+    "legalise": "legalize",
+    "legalised": "legalized",
+    "legalises": "legalizes",
+    "legalising": "legalizing",
+    "legitimise": "legitimize",
+    "legitimised": "legitimized",
+    "legitimises": "legitimizes",
+    "legitimising": "legitimizing",
+    "leukaemia": "leukemia",
+    "levelled": "leveled",
+    "leveller": "leveler",
+    "levellers": "levelers",
+    "levelling": "leveling",
+    "libelled": "libeled",
+    "libelling": "libeling",
+    "libellous": "libelous",
+    "liberalisation": "liberalization",
+    "liberalise": "liberalize",
+    "liberalised": "liberalized",
+    "liberalises": "liberalizes",
+    "liberalising": "liberalizing",
+    "licence": "license",
+    "licenced": "licensed",
+    "licences": "licenses",
+    "licencing": "licensing",
+    "likeable": "likable",
+    "lionisation": "lionization",
+    "lionise": "lionize",
+    "lionised": "lionized",
+    "lionises": "lionizes",
+    "lionising": "lionizing",
+    "liquidise": "liquidize",
+    "liquidised": "liquidized",
+    "liquidiser": "liquidizer",
+    "liquidisers": "liquidizers",
+    "liquidises": "liquidizes",
+    "liquidising": "liquidizing",
+    "litre": "liter",
+    "litres": "liters",
+    "localise": "localize",
+    "localised": "localized",
+    "localises": "localizes",
+    "localising": "localizing",
+    "louvre": "louver",
+    "louvred": "louvered",
+    "louvres": "louvers",
+    "lustre": "luster",
+    "magnetise": "magnetize",
+    "magnetised": "magnetized",
+    "magnetises": "magnetizes",
+    "magnetising": "magnetizing",
+    "manoeuvrability": "maneuverability",
+    "manoeuvrable": "maneuverable",
+    "manoeuvre": "maneuver",
+    "manoeuvred": "maneuvered",
+    "manoeuvres": "maneuvers",
+    "manoeuvring": "maneuvering",
+    "manoeuvrings": "maneuverings",
+    "marginalisation": "marginalization",
+    "marginalise": "marginalize",
+    "marginalised": "marginalized",
+    "marginalises": "marginalizes",
+    "marginalising": "marginalizing",
+    "marshalled": "marshaled",
+    "marshalling": "marshaling",
+    "marvelled": "marveled",
+    "marvelling": "marveling",
+    "marvellous": "marvelous",
+    "marvellously": "marvelously",
+    "materialisation": "materialization",
+    "materialise": "materialize",
+    "materialised": "materialized",
+    "materialises": "materializes",
+    "materialising": "materializing",
+    "maximisation": "maximization",
+    "maximise": "maximize",
+    "maximised": "maximized",
+    "maximises": "maximizes",
+    "maximising": "maximizing",
+    "meagre": "meager",
+    "mechanisation": "mechanization",
+    "mechanise": "mechanize",
+    "mechanised": "mechanized",
+    "mechanises": "mechanizes",
+    "mechanising": "mechanizing",
+    "mediaeval": "medieval",
+    "memorialise": "memorialize",
+    "memorialised": "memorialized",
+    "memorialises": "memorializes",
+    "memorialising": "memorializing",
+    "memorise": "memorize",
+    "memorised": "memorized",
+    "memorises": "memorizes",
+    "memorising": "memorizing",
+    "mesmerise": "mesmerize",
+    "mesmerised": "mesmerized",
+    "mesmerises": "mesmerizes",
+    "mesmerising": "mesmerizing",
+    "metabolise": "metabolize",
+    "metabolised": "metabolized",
+    "metabolises": "metabolizes",
+    "metabolising": "metabolizing",
+    "metre": "meter",
+    "metres": "meters",
+    "micrometre": "micrometer",
+    "micrometres": "micrometers",
+    "militarise": "militarize",
+    "militarised": "militarized",
+    "militarises": "militarizes",
+    "militarising": "militarizing",
+    "milligramme": "milligram",
+    "milligrammes": "milligrams",
+    "millilitre": "milliliter",
+    "millilitres": "milliliters",
+    "millimetre": "millimeter",
+    "millimetres": "millimeters",
+    "miniaturisation": "miniaturization",
+    "miniaturise": "miniaturize",
+    "miniaturised": "miniaturized",
+    "miniaturises": "miniaturizes",
+    "miniaturising": "miniaturizing",
+    "minibusses": "minibuses",
+    "minimise": "minimize",
+    "minimised": "minimized",
+    "minimises": "minimizes",
+    "minimising": "minimizing",
+    "misbehaviour": "misbehavior",
+    "misdemeanour": "misdemeanor",
+    "misdemeanours": "misdemeanors",
+    "misspelt": "misspelled",
+    "mitre": "miter",
+    "mitres": "miters",
+    "mobilisation": "mobilization",
+    "mobilise": "mobilize",
+    "mobilised": "mobilized",
+    "mobilises": "mobilizes",
+    "mobilising": "mobilizing",
+    "modelled": "modeled",
+    "modeller": "modeler",
+    "modellers": "modelers",
+    "modelling": "modeling",
+    "modernise": "modernize",
+    "modernised": "modernized",
+    "modernises": "modernizes",
+    "modernising": "modernizing",
+    "moisturise": "moisturize",
+    "moisturised": "moisturized",
+    "moisturiser": "moisturizer",
+    "moisturisers": "moisturizers",
+    "moisturises": "moisturizes",
+    "moisturising": "moisturizing",
+    "monologue": "monolog",
+    "monologues": "monologs",
+    "monopolisation": "monopolization",
+    "monopolise": "monopolize",
+    "monopolised": "monopolized",
+    "monopolises": "monopolizes",
+    "monopolising": "monopolizing",
+    "moralise": "moralize",
+    "moralised": "moralized",
+    "moralises": "moralizes",
+    "moralising": "moralizing",
+    "motorised": "motorized",
+    "mould": "mold",
+    "moulded": "molded",
+    "moulder": "molder",
+    "mouldered": "moldered",
+    "mouldering": "moldering",
+    "moulders": "molders",
+    "mouldier": "moldier",
+    "mouldiest": "moldiest",
+    "moulding": "molding",
+    "mouldings": "moldings",
+    "moulds": "molds",
+    "mouldy": "moldy",
+    "moult": "molt",
+    "moulted": "molted",
+    "moulting": "molting",
+    "moults": "molts",
+    "moustache": "mustache",
+    "moustached": "mustached",
+    "moustaches": "mustaches",
+    "moustachioed": "mustachioed",
+    "multicoloured": "multicolored",
+    "nationalisation": "nationalization",
+    "nationalisations": "nationalizations",
+    "nationalise": "nationalize",
+    "nationalised": "nationalized",
+    "nationalises": "nationalizes",
+    "nationalising": "nationalizing",
+    "naturalisation": "naturalization",
+    "naturalise": "naturalize",
+    "naturalised": "naturalized",
+    "naturalises": "naturalizes",
+    "naturalising": "naturalizing",
+    "neighbour": "neighbor",
+    "neighbourhood": "neighborhood",
+    "neighbourhoods": "neighborhoods",
+    "neighbouring": "neighboring",
+    "neighbourliness": "neighborliness",
+    "neighbourly": "neighborly",
+    "neighbours": "neighbors",
+    "neutralisation": "neutralization",
+    "neutralise": "neutralize",
+    "neutralised": "neutralized",
+    "neutralises": "neutralizes",
+    "neutralising": "neutralizing",
+    "normalisation": "normalization",
+    "normalise": "normalize",
+    "normalised": "normalized",
+    "normalises": "normalizes",
+    "normalising": "normalizing",
+    "odour": "odor",
+    "odourless": "odorless",
+    "odours": "odors",
+    "oesophagus": "esophagus",
+    "oesophaguses": "esophaguses",
+    "oestrogen": "estrogen",
+    "offence": "offense",
+    "offences": "offenses",
+    "omelette": "omelet",
+    "omelettes": "omelets",
+    "optimise": "optimize",
+    "optimised": "optimized",
+    "optimises": "optimizes",
+    "optimising": "optimizing",
+    "organisation": "organization",
+    "organisational": "organizational",
+    "organisations": "organizations",
+    "organise": "organize",
+    "organised": "organized",
+    "organiser": "organizer",
+    "organisers": "organizers",
+    "organises": "organizes",
+    "organising": "organizing",
+    "orthopaedic": "orthopedic",
+    "orthopaedics": "orthopedics",
+    "ostracise": "ostracize",
+    "ostracised": "ostracized",
+    "ostracises": "ostracizes",
+    "ostracising": "ostracizing",
+    "outmanoeuvre": "outmaneuver",
+    "outmanoeuvred": "outmaneuvered",
+    "outmanoeuvres": "outmaneuvers",
+    "outmanoeuvring": "outmaneuvering",
+    "overemphasise": "overemphasize",
+    "overemphasised": "overemphasized",
+    "overemphasises": "overemphasizes",
+    "overemphasising": "overemphasizing",
+    "oxidisation": "oxidization",
+    "oxidise": "oxidize",
+    "oxidised": "oxidized",
+    "oxidises": "oxidizes",
+    "oxidising": "oxidizing",
+    "paederast": "pederast",
+    "paederasts": "pederasts",
+    "paediatric": "pediatric",
+    "paediatrician": "pediatrician",
+    "paediatricians": "pediatricians",
+    "paediatrics": "pediatrics",
+    "paedophile": "pedophile",
+    "paedophiles": "pedophiles",
+    "paedophilia": "pedophilia",
+    "palaeolithic": "paleolithic",
+    "palaeontologist": "paleontologist",
+    "palaeontologists": "paleontologists",
+    "palaeontology": "paleontology",
+    "panelled": "paneled",
+    "panelling": "paneling",
+    "panellist": "panelist",
+    "panellists": "panelists",
+    "paralyse": "paralyze",
+    "paralysed": "paralyzed",
+    "paralyses": "paralyzes",
+    "paralysing": "paralyzing",
+    "parcelled": "parceled",
+    "parcelling": "parceling",
+    "parlour": "parlor",
+    "parlours": "parlors",
+    "particularise": "particularize",
+    "particularised": "particularized",
+    "particularises": "particularizes",
+    "particularising": "particularizing",
+    "passivisation": "passivization",
+    "passivise": "passivize",
+    "passivised": "passivized",
+    "passivises": "passivizes",
+    "passivising": "passivizing",
+    "pasteurisation": "pasteurization",
+    "pasteurise": "pasteurize",
+    "pasteurised": "pasteurized",
+    "pasteurises": "pasteurizes",
+    "pasteurising": "pasteurizing",
+    "patronise": "patronize",
+    "patronised": "patronized",
+    "patronises": "patronizes",
+    "patronising": "patronizing",
+    "patronisingly": "patronizingly",
+    "pedalled": "pedaled",
+    "pedalling": "pedaling",
+    "pedestrianisation": "pedestrianization",
+    "pedestrianise": "pedestrianize",
+    "pedestrianised": "pedestrianized",
+    "pedestrianises": "pedestrianizes",
+    "pedestrianising": "pedestrianizing",
+    "penalise": "penalize",
+    "penalised": "penalized",
+    "penalises": "penalizes",
+    "penalising": "penalizing",
+    "pencilled": "penciled",
+    "pencilling": "penciling",
+    "personalise": "personalize",
+    "personalised": "personalized",
+    "personalises": "personalizes",
+    "personalising": "personalizing",
+    "pharmacopoeia": "pharmacopeia",
+    "pharmacopoeias": "pharmacopeias",
+    "philosophise": "philosophize",
+    "philosophised": "philosophized",
+    "philosophises": "philosophizes",
+    "philosophising": "philosophizing",
+    "philtre": "filter",
+    "philtres": "filters",
+    "phoney": "phony",
+    "plagiarise": "plagiarize",
+    "plagiarised": "plagiarized",
+    "plagiarises": "plagiarizes",
+    "plagiarising": "plagiarizing",
+    "plough": "plow",
+    "ploughed": "plowed",
+    "ploughing": "plowing",
+    "ploughman": "plowman",
+    "ploughmen": "plowmen",
+    "ploughs": "plows",
+    "ploughshare": "plowshare",
+    "ploughshares": "plowshares",
+    "polarisation": "polarization",
+    "polarise": "polarize",
+    "polarised": "polarized",
+    "polarises": "polarizes",
+    "polarising": "polarizing",
+    "politicisation": "politicization",
+    "politicise": "politicize",
+    "politicised": "politicized",
+    "politicises": "politicizes",
+    "politicising": "politicizing",
+    "popularisation": "popularization",
+    "popularise": "popularize",
+    "popularised": "popularized",
+    "popularises": "popularizes",
+    "popularising": "popularizing",
+    "pouffe": "pouf",
+    "pouffes": "poufs",
+    "practise": "practice",
+    "practised": "practiced",
+    "practises": "practices",
+    "practising": "practicing",
+    "praesidium": "presidium",
+    "praesidiums": "presidiums",
+    "pressurisation": "pressurization",
+    "pressurise": "pressurize",
+    "pressurised": "pressurized",
+    "pressurises": "pressurizes",
+    "pressurising": "pressurizing",
+    "pretence": "pretense",
+    "pretences": "pretenses",
+    "primaeval": "primeval",
+    "prioritisation": "prioritization",
+    "prioritise": "prioritize",
+    "prioritised": "prioritized",
+    "prioritises": "prioritizes",
+    "prioritising": "prioritizing",
+    "privatisation": "privatization",
+    "privatisations": "privatizations",
+    "privatise": "privatize",
+    "privatised": "privatized",
+    "privatises": "privatizes",
+    "privatising": "privatizing",
+    "professionalisation": "professionalization",
+    "professionalise": "professionalize",
+    "professionalised": "professionalized",
+    "professionalises": "professionalizes",
+    "professionalising": "professionalizing",
+    "programme": "program",
+    "programmes": "programs",
+    "prologue": "prolog",
+    "prologues": "prologs",
+    "propagandise": "propagandize",
+    "propagandised": "propagandized",
+    "propagandises": "propagandizes",
+    "propagandising": "propagandizing",
+    "proselytise": "proselytize",
+    "proselytised": "proselytized",
+    "proselytiser": "proselytizer",
+    "proselytisers": "proselytizers",
+    "proselytises": "proselytizes",
+    "proselytising": "proselytizing",
+    "psychoanalyse": "psychoanalyze",
+    "psychoanalysed": "psychoanalyzed",
+    "psychoanalyses": "psychoanalyzes",
+    "psychoanalysing": "psychoanalyzing",
+    "publicise": "publicize",
+    "publicised": "publicized",
+    "publicises": "publicizes",
+    "publicising": "publicizing",
+    "pulverisation": "pulverization",
+    "pulverise": "pulverize",
+    "pulverised": "pulverized",
+    "pulverises": "pulverizes",
+    "pulverising": "pulverizing",
+    "pummelled": "pummel",
+    "pummelling": "pummeled",
+    "pyjama": "pajama",
+    "pyjamas": "pajamas",
+    "pzazz": "pizzazz",
+    "quarrelled": "quarreled",
+    "quarrelling": "quarreling",
+    "radicalise": "radicalize",
+    "radicalised": "radicalized",
+    "radicalises": "radicalizes",
+    "radicalising": "radicalizing",
+    "rancour": "rancor",
+    "randomise": "randomize",
+    "randomised": "randomized",
+    "randomises": "randomizes",
+    "randomising": "randomizing",
+    "rationalisation": "rationalization",
+    "rationalisations": "rationalizations",
+    "rationalise": "rationalize",
+    "rationalised": "rationalized",
+    "rationalises": "rationalizes",
+    "rationalising": "rationalizing",
+    "ravelled": "raveled",
+    "ravelling": "raveling",
+    "realisable": "realizable",
+    "realisation": "realization",
+    "realisations": "realizations",
+    "realise": "realize",
+    "realised": "realized",
+    "realises": "realizes",
+    "realising": "realizing",
+    "recognisable": "recognizable",
+    "recognisably": "recognizably",
+    "recognisance": "recognizance",
+    "recognise": "recognize",
+    "recognised": "recognized",
+    "recognises": "recognizes",
+    "recognising": "recognizing",
+    "reconnoitre": "reconnoiter",
+    "reconnoitred": "reconnoitered",
+    "reconnoitres": "reconnoiters",
+    "reconnoitring": "reconnoitering",
+    "refuelled": "refueled",
+    "refuelling": "refueling",
+    "regularisation": "regularization",
+    "regularise": "regularize",
+    "regularised": "regularized",
+    "regularises": "regularizes",
+    "regularising": "regularizing",
+    "remodelled": "remodeled",
+    "remodelling": "remodeling",
+    "remould": "remold",
+    "remoulded": "remolded",
+    "remoulding": "remolding",
+    "remoulds": "remolds",
+    "reorganisation": "reorganization",
+    "reorganisations": "reorganizations",
+    "reorganise": "reorganize",
+    "reorganised": "reorganized",
+    "reorganises": "reorganizes",
+    "reorganising": "reorganizing",
+    "revelled": "reveled",
+    "reveller": "reveler",
+    "revellers": "revelers",
+    "revelling": "reveling",
+    "revitalise": "revitalize",
+    "revitalised": "revitalized",
+    "revitalises": "revitalizes",
+    "revitalising": "revitalizing",
+    "revolutionise": "revolutionize",
+    "revolutionised": "revolutionized",
+    "revolutionises": "revolutionizes",
+    "revolutionising": "revolutionizing",
+    "rhapsodise": "rhapsodize",
+    "rhapsodised": "rhapsodized",
+    "rhapsodises": "rhapsodizes",
+    "rhapsodising": "rhapsodizing",
+    "rigour": "rigor",
+    "rigours": "rigors",
+    "ritualised": "ritualized",
+    "rivalled": "rivaled",
+    "rivalling": "rivaling",
+    "romanticise": "romanticize",
+    "romanticised": "romanticized",
+    "romanticises": "romanticizes",
+    "romanticising": "romanticizing",
+    "rumour": "rumor",
+    "rumoured": "rumored",
+    "rumours": "rumors",
+    "sabre": "saber",
+    "sabres": "sabers",
+    "saltpetre": "saltpeter",
+    "sanitise": "sanitize",
+    "sanitised": "sanitized",
+    "sanitises": "sanitizes",
+    "sanitising": "sanitizing",
+    "satirise": "satirize",
+    "satirised": "satirized",
+    "satirises": "satirizes",
+    "satirising": "satirizing",
+    "saviour": "savior",
+    "saviours": "saviors",
+    "savour": "savor",
+    "savoured": "savored",
+    "savouries": "savories",
+    "savouring": "savoring",
+    "savours": "savors",
+    "savoury": "savory",
+    "scandalise": "scandalize",
+    "scandalised": "scandalized",
+    "scandalises": "scandalizes",
+    "scandalising": "scandalizing",
+    "sceptic": "skeptic",
+    "sceptical": "skeptical",
+    "sceptically": "skeptically",
+    "scepticism": "skepticism",
+    "sceptics": "skeptics",
+    "sceptre": "scepter",
+    "sceptres": "scepters",
+    "scrutinise": "scrutinize",
+    "scrutinised": "scrutinized",
+    "scrutinises": "scrutinizes",
+    "scrutinising": "scrutinizing",
+    "secularisation": "secularization",
+    "secularise": "secularize",
+    "secularised": "secularized",
+    "secularises": "secularizes",
+    "secularising": "secularizing",
+    "sensationalise": "sensationalize",
+    "sensationalised": "sensationalized",
+    "sensationalises": "sensationalizes",
+    "sensationalising": "sensationalizing",
+    "sensitise": "sensitize",
+    "sensitised": "sensitized",
+    "sensitises": "sensitizes",
+    "sensitising": "sensitizing",
+    "sentimentalise": "sentimentalize",
+    "sentimentalised": "sentimentalized",
+    "sentimentalises": "sentimentalizes",
+    "sentimentalising": "sentimentalizing",
+    "sepulchre": "sepulcher",
+    "sepulchres": "sepulchers",
+    "serialisation": "serialization",
+    "serialisations": "serializations",
+    "serialise": "serialize",
+    "serialised": "serialized",
+    "serialises": "serializes",
+    "serialising": "serializing",
+    "sermonise": "sermonize",
+    "sermonised": "sermonized",
+    "sermonises": "sermonizes",
+    "sermonising": "sermonizing",
+    "sheikh": "sheik",
+    "shovelled": "shoveled",
+    "shovelling": "shoveling",
+    "shrivelled": "shriveled",
+    "shrivelling": "shriveling",
+    "signalise": "signalize",
+    "signalised": "signalized",
+    "signalises": "signalizes",
+    "signalising": "signalizing",
+    "signalled": "signaled",
+    "signalling": "signaling",
+    "smoulder": "smolder",
+    "smouldered": "smoldered",
+    "smouldering": "smoldering",
+    "smoulders": "smolders",
+    "snivelled": "sniveled",
+    "snivelling": "sniveling",
+    "snorkelled": "snorkeled",
+    "snorkelling": "snorkeling",
+    "snowplough": "snowplow",
+    "snowploughs": "snowplow",
+    "socialisation": "socialization",
+    "socialise": "socialize",
+    "socialised": "socialized",
+    "socialises": "socializes",
+    "socialising": "socializing",
+    "sodomise": "sodomize",
+    "sodomised": "sodomized",
+    "sodomises": "sodomizes",
+    "sodomising": "sodomizing",
+    "solemnise": "solemnize",
+    "solemnised": "solemnized",
+    "solemnises": "solemnizes",
+    "solemnising": "solemnizing",
+    "sombre": "somber",
+    "specialisation": "specialization",
+    "specialisations": "specializations",
+    "specialise": "specialize",
+    "specialised": "specialized",
+    "specialises": "specializes",
+    "specialising": "specializing",
+    "spectre": "specter",
+    "spectres": "specters",
+    "spiralled": "spiraled",
+    "spiralling": "spiraling",
+    "splendour": "splendor",
+    "splendours": "splendors",
+    "squirrelled": "squirreled",
+    "squirrelling": "squirreling",
+    "stabilisation": "stabilization",
+    "stabilise": "stabilize",
+    "stabilised": "stabilized",
+    "stabiliser": "stabilizer",
+    "stabilisers": "stabilizers",
+    "stabilises": "stabilizes",
+    "stabilising": "stabilizing",
+    "standardisation": "standardization",
+    "standardise": "standardize",
+    "standardised": "standardized",
+    "standardises": "standardizes",
+    "standardising": "standardizing",
+    "stencilled": "stenciled",
+    "stencilling": "stenciling",
+    "sterilisation": "sterilization",
+    "sterilisations": "sterilizations",
+    "sterilise": "sterilize",
+    "sterilised": "sterilized",
+    "steriliser": "sterilizer",
+    "sterilisers": "sterilizers",
+    "sterilises": "sterilizes",
+    "sterilising": "sterilizing",
+    "stigmatisation": "stigmatization",
+    "stigmatise": "stigmatize",
+    "stigmatised": "stigmatized",
+    "stigmatises": "stigmatizes",
+    "stigmatising": "stigmatizing",
+    "storey": "story",
+    "storeys": "stories",
+    "subsidisation": "subsidization",
+    "subsidise": "subsidize",
+    "subsidised": "subsidized",
+    "subsidiser": "subsidizer",
+    "subsidisers": "subsidizers",
+    "subsidises": "subsidizes",
+    "subsidising": "subsidizing",
+    "succour": "succor",
+    "succoured": "succored",
+    "succouring": "succoring",
+    "succours": "succors",
+    "sulphate": "sulfate",
+    "sulphates": "sulfates",
+    "sulphide": "sulfide",
+    "sulphides": "sulfides",
+    "sulphur": "sulfur",
+    "sulphurous": "sulfurous",
+    "summarise": "summarize",
+    "summarised": "summarized",
+    "summarises": "summarizes",
+    "summarising": "summarizing",
+    "swivelled": "swiveled",
+    "swivelling": "swiveling",
+    "symbolise": "symbolize",
+    "symbolised": "symbolized",
+    "symbolises": "symbolizes",
+    "symbolising": "symbolizing",
+    "sympathise": "sympathize",
+    "sympathised": "sympathized",
+    "sympathiser": "sympathizer",
+    "sympathisers": "sympathizers",
+    "sympathises": "sympathizes",
+    "sympathising": "sympathizing",
+    "synchronisation": "synchronization",
+    "synchronise": "synchronize",
+    "synchronised": "synchronized",
+    "synchronises": "synchronizes",
+    "synchronising": "synchronizing",
+    "synthesise": "synthesize",
+    "synthesised": "synthesized",
+    "synthesiser": "synthesizer",
+    "synthesisers": "synthesizers",
+    "synthesises": "synthesizes",
+    "synthesising": "synthesizing",
+    "syphon": "siphon",
+    "syphoned": "siphoned",
+    "syphoning": "siphoning",
+    "syphons": "siphons",
+    "systematisation": "systematization",
+    "systematise": "systematize",
+    "systematised": "systematized",
+    "systematises": "systematizes",
+    "systematising": "systematizing",
+    "tantalise": "tantalize",
+    "tantalised": "tantalized",
+    "tantalises": "tantalizes",
+    "tantalising": "tantalizing",
+    "tantalisingly": "tantalizingly",
+    "tasselled": "tasseled",
+    "technicolour": "technicolor",
+    "temporise": "temporize",
+    "temporised": "temporized",
+    "temporises": "temporizes",
+    "temporising": "temporizing",
+    "tenderise": "tenderize",
+    "tenderised": "tenderized",
+    "tenderises": "tenderizes",
+    "tenderising": "tenderizing",
+    "terrorise": "terrorize",
+    "terrorised": "terrorized",
+    "terrorises": "terrorizes",
+    "terrorising": "terrorizing",
+    "theatre": "theater",
+    "theatregoer": "theatergoer",
+    "theatregoers": "theatergoers",
+    "theatres": "theaters",
+    "theorise": "theorize",
+    "theorised": "theorized",
+    "theorises": "theorizes",
+    "theorising": "theorizing",
+    "tonne": "ton",
+    "tonnes": "tons",
+    "towelled": "toweled",
+    "towelling": "toweling",
+    "toxaemia": "toxemia",
+    "tranquillise": "tranquilize",
+    "tranquillised": "tranquilized",
+    "tranquilliser": "tranquilizer",
+    "tranquillisers": "tranquilizers",
+    "tranquillises": "tranquilizes",
+    "tranquillising": "tranquilizing",
+    "tranquillity": "tranquility",
+    "tranquillize": "tranquilize",
+    "tranquillized": "tranquilized",
+    "tranquillizer": "tranquilizer",
+    "tranquillizers": "tranquilizers",
+    "tranquillizes": "tranquilizes",
+    "tranquillizing": "tranquilizing",
+    "tranquilly": "tranquility",
+    "transistorised": "transistorized",
+    "traumatise": "traumatize",
+    "traumatised": "traumatized",
+    "traumatises": "traumatizes",
+    "traumatising": "traumatizing",
+    "travelled": "traveled",
+    "traveller": "traveler",
+    "travellers": "travelers",
+    "travelling": "traveling",
+    "travelog": "travelogue",
+    "travelogs": "travelogues",
+    "trialled": "trialed",
+    "trialling": "trialing",
+    "tricolour": "tricolor",
+    "tricolours": "tricolors",
+    "trivialise": "trivialize",
+    "trivialised": "trivialized",
+    "trivialises": "trivializes",
+    "trivialising": "trivializing",
+    "tumour": "tumor",
+    "tumours": "tumors",
+    "tunnelled": "tunneled",
+    "tunnelling": "tunneling",
+    "tyrannise": "tyrannize",
+    "tyrannised": "tyrannized",
+    "tyrannises": "tyrannizes",
+    "tyrannising": "tyrannizing",
+    "tyre": "tire",
+    "tyres": "tires",
+    "unauthorised": "unauthorized",
+    "uncivilised": "uncivilized",
+    "underutilised": "underutilized",
+    "unequalled": "unequaled",
+    "unfavourable": "unfavorable",
+    "unfavourably": "unfavorably",
+    "unionisation": "unionization",
+    "unionise": "unionize",
+    "unionised": "unionized",
+    "unionises": "unionizes",
+    "unionising": "unionizing",
+    "unorganised": "unorganized",
+    "unravelled": "unraveled",
+    "unravelling": "unraveling",
+    "unrecognisable": "unrecognizable",
+    "unrecognised": "unrecognized",
+    "unrivalled": "unrivaled",
+    "unsavoury": "unsavory",
+    "untrammelled": "untrammeled",
+    "urbanisation": "urbanization",
+    "urbanise": "urbanize",
+    "urbanised": "urbanized",
+    "urbanises": "urbanizes",
+    "urbanising": "urbanizing",
+    "utilisable": "utilizable",
+    "utilisation": "utilization",
+    "utilise": "utilize",
+    "utilised": "utilized",
+    "utilises": "utilizes",
+    "utilising": "utilizing",
+    "valour": "valor",
+    "vandalise": "vandalize",
+    "vandalised": "vandalized",
+    "vandalises": "vandalizes",
+    "vandalising": "vandalizing",
+    "vaporisation": "vaporization",
+    "vaporise": "vaporize",
+    "vaporised": "vaporized",
+    "vaporises": "vaporizes",
+    "vaporising": "vaporizing",
+    "vapour": "vapor",
+    "vapours": "vapors",
+    "verbalise": "verbalize",
+    "verbalised": "verbalized",
+    "verbalises": "verbalizes",
+    "verbalising": "verbalizing",
+    "victimisation": "victimization",
+    "victimise": "victimize",
+    "victimised": "victimized",
+    "victimises": "victimizes",
+    "victimising": "victimizing",
+    "videodisc": "videodisk",
+    "videodiscs": "videodisks",
+    "vigour": "vigor",
+    "visualisation": "visualization",
+    "visualisations": "visualizations",
+    "visualise": "visualize",
+    "visualised": "visualized",
+    "visualises": "visualizes",
+    "visualising": "visualizing",
+    "vocalisation": "vocalization",
+    "vocalisations": "vocalizations",
+    "vocalise": "vocalize",
+    "vocalised": "vocalized",
+    "vocalises": "vocalizes",
+    "vocalising": "vocalizing",
+    "vulcanised": "vulcanized",
+    "vulgarisation": "vulgarization",
+    "vulgarise": "vulgarize",
+    "vulgarised": "vulgarized",
+    "vulgarises": "vulgarizes",
+    "vulgarising": "vulgarizing",
+    "waggon": "wagon",
+    "waggons": "wagons",
+    "watercolour": "watercolor",
+    "watercolours": "watercolors",
+    "weaselled": "weaseled",
+    "weaselling": "weaseling",
+    "westernisation": "westernization",
+    "westernise": "westernize",
+    "westernised": "westernized",
+    "westernises": "westernizes",
+    "westernising": "westernizing",
+    "womanise": "womanize",
+    "womanised": "womanized",
+    "womaniser": "womanizer",
+    "womanisers": "womanizers",
+    "womanises": "womanizes",
+    "womanising": "womanizing",
+    "woollen": "woolen",
+    "woollens": "woolens",
+    "woollies": "woolies",
+    "woolly": "wooly",
+    "worshipped": "worshiped",
+    "worshipping": "worshiping",
+    "worshipper": "worshiper",
+    "yodelled": "yodeled",
+    "yodelling": "yodeling",
+    "yoghourt": "yogurt",
+    "yoghourts": "yogurts",
+    "yoghurt": "yogurt",
+    "yoghurts": "yogurts",
+    "mhm": "hmm",
+    "mm": "hmm",
+    "mmm": "hmm"
+}
\ No newline at end of file

From e6ee5b5070cfcbac048c661ead5dd174e51beea1 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 13:24:32 +0000
Subject: [PATCH 137/156] nits on documentation

---
 .../models/whisper/configuration_whisper.py   |  25 ++--
 .../whisper/convert_openai_whisper_to_tfms.py |  11 +-
 .../whisper/feature_extraction_whisper.py     |  23 ++-
 .../models/whisper/modeling_whisper.py        |  35 ++---
 .../models/whisper/tokenization_whisper.py    | 139 +-----------------
 5 files changed, 47 insertions(+), 186 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 438eec67c7257..d4242ab34818a 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -53,7 +53,7 @@
 
 class WhisperConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate an
+    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
     Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of the Whisper
     [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.
@@ -65,10 +65,10 @@ class WhisperConfig(PretrainedConfig):
     Args:
         vocab_size (`int`, *optional*, defaults to 51865):
             Vocabulary size of the Whisper model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`WhisperModel`]
+            `decoder_input_ids` passed when calling [`WhisperModel`]
         num_mel_bins (`int`, *optional*, defaults to 80):
             Number of mel features used per input features. Should correspond to the value used in the
-            `WhisperProcessor`` class.
+            `WhisperProcessor` class.
         encoder_layers (`int`, *optional*, defaults to 6):
             Number of encoder layers.
         decoder_layers (`int`, *optional*, defaults to 6):
@@ -77,10 +77,10 @@ class WhisperConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         decoder_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 1536):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         encoder_ffn_dim (`int`, *optional*, defaults to 1536):
             Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 1536):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         encoder_layerdrop (`float`, *optional*, defaults to 0.0):
             The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
             for more details.
@@ -89,16 +89,17 @@ class WhisperConfig(PretrainedConfig):
             for more details.
         decoder_start_token_id (`int`, *optional*, defaults to 50257):
             Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
-            are provided to the `generate`function
+            are provided to the `generate` function. It is used to guide the model`s generation process depending on
+            the task.
         use_cache (`bool`, *optional*, defaults to True):
             Whether or not the model should return the last key/values attentions (not used by all models).
         is_encoder_decoder (`bool`, *optional*, defaults to True):
-            _description_
+            Whether the model is used as an encoder/decoder or not.
         activation_function (`str`, *optional*, defaults to "gelu"):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"silu"` and `"gelu_new"` are supported.
         d_model (`int`, *optional*, defaults to 256):
-            Dimensionality of the layers and the pooler layer.
+            Dimensionality of the layers.
         dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -108,7 +109,7 @@ class WhisperConfig(PretrainedConfig):
         init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         scale_embedding (`bool`, *optional*, defaults to False):
-            _description_
+            Scale embeddings by diving by sqrt(d_model).
         max_source_positions (`int`, *optional*, defaults to 1500):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
         max_target_positions (`int`, *optional*, defaults to 448):
@@ -124,7 +125,8 @@ class WhisperConfig(PretrainedConfig):
             Whether to tie input and output embeddings.
         suppress_tokens (`List[int]`, *optional*, defaults to None):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
-            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI can be use here.
+            function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
+            `multilingual` model.
         begin_suppress_tokens (`List[int]`, *optional*, defaults to `[220,50256]`):
             A list containing tokens that will be supressed at the beginning of the sampling process. Initialized as
             the token for `" "` (`blank_token_id`) and the `eos_token_id`
@@ -180,9 +182,6 @@ def __init__(
         begin_suppress_tokens=[220, 50256],
         **kwargs
     ):
-        """_summary_
-
-        Args:"""
         self.vocab_size = vocab_size
         self.num_mel_bins = num_mel_bins
         self.d_model = d_model
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index c7a4e143ad9d1..4163b81bf09e2 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -21,7 +21,6 @@
 import warnings
 
 import torch
-from torch import nn
 from tqdm import tqdm
 
 from transformers import WhisperConfig, WhisperForConditionalGeneration
@@ -72,13 +71,6 @@ def rename_keys(s_dict):
     return s_dict
 
 
-def make_linear_from_emb(emb):
-    vocab_size, emb_size = emb.weight.shape
-    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
-    lin_layer.weight.data = emb.weight.data
-    return lin_layer
-
-
 def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, checkpoint_path="weights"):
     full_path = os.path.join(os.getcwd(), checkpoint_path)
     if not os.path.isdir(os.path.join(full_path)):
@@ -140,6 +132,7 @@ def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, ch
 
 
 def _download(url: str, root: str) -> bytes:
+    # Copied from whisper's original codebase
     os.makedirs(root, exist_ok=True)
     filename = os.path.basename(url)
 
@@ -210,7 +203,7 @@ def convert_every_model(save_dir="whisper"):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    # # Required parameters
+    # Required parameters
     parser.add_argument("--original_name", type=str, help="Path to the fairseq model (.pt) file.")
     parser.add_argument(
         "--pytorch_dump_folder_path", default="whisper-converted", type=str, help="Path to the output PyTorch model."
diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py
index 2206e86573c18..ce5de7b65afa8 100644
--- a/src/transformers/models/whisper/feature_extraction_whisper.py
+++ b/src/transformers/models/whisper/feature_extraction_whisper.py
@@ -36,8 +36,8 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
     This feature extractor inherits from [`WhisperFeatureExtractor`] which contains most of the main methods. Users
     should refer to this superclass for more information regarding those methods.
 
-    This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
-    mean and variance normalization to the extracted features.
+    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
+    Fourier Transform` which should match pytorch's `torch.stft` equivalent.
 
     Args:
         feature_size (`int`, defaults to 80):
@@ -51,7 +51,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor):
             sequences.
         n_fft (`int`, defaults to 400):
             Size of the Fourier transform.
-        padding_value (`float`, defaults to 0.0):
+        padding_value (`float`, *optional*, defaults to 0.0):
             Padding value used to pad the audio. Should correspond to silences.
     """
 
@@ -127,6 +127,13 @@ def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
         return weights
 
     def fram_wave(self, waveform, center=True):
+        """
+        Transform a raw waveform into a list of smaller waveforms. The window length defines how much of the signal is
+        contain in each frame (smalle waveform), while the hope length defines the step between the beginning of each
+        new frame.
+
+        Centering is done by reflecting the waveform which is first centered around `frame_idx * hop_length`.
+        """
         frames = []
         for i in range(0, waveform.shape[0] + 1, self.hop_length):
             half_window = (self.n_fft - 1) // 2 + 1
@@ -158,7 +165,7 @@ def fram_wave(self, waveform, center=True):
     def stft(self, frames, window):
         """
         Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same
-        results as torch.stft
+        results as `torch.stft`.
         """
         frame_size = frames.shape[1]
         fft_size = self.n_fft
@@ -184,8 +191,8 @@ def stft(self, frames, window):
 
     def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
         """
-        Compute the log-Mel spectrogram of the provided audio, gives similar results to a torch implementation at 1e-5
-        tolerance.
+        Compute the log-Mel spectrogram of the provided audio, gives similar results whisper's original torch
+        implementation with 1e-5 tolerance.
         """
         window = np.hanning(self.n_fft + 1)[:-1]
 
@@ -220,9 +227,9 @@ def __call__(
             raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy arrays or a list of list of float values.
-            truncation (`bool`):
+            truncation (`bool`, *optional*, default to `True`):
                 Activates truncation to cut input sequences longer than *max_length* to *max_length*.
-            pad_to_multiple_of (`int`, *optional*):
+            pad_to_multiple_of (`int`, *optional*, defaults to None):
                 If set will pad the sequence to a multiple of the provided value.
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index 0d5c9e9b5ab83..ef23914b8ce77 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -487,21 +487,21 @@ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
 
 WHISPER_INPUTS_DOCSTRING = r"""
     Args:
-        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
-            Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained
-            by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.*
-            via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
-            [`WhisperFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a
+        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
             tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
             Indices of decoder input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`SpeechToTextTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
 
             [What are decoder input IDs?](../glossary#decoder-input-ids)
 
-            SpeechToText uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            Whisper uses the `decoder_start_token_id` as the starting token for `decoder_input_ids` generation. If
             `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
             `past_key_values`).
         decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -609,11 +609,11 @@ def forward(
     ):
         r"""
         Args:
-            input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`):
-                Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be
+            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                 obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                 `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
-                `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the fbank features,
+                `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the mel features,
                 padding and conversion into a tensor of type `torch.FloatTensor`. See
                 [`~WhisperFeatureExtractor.__call__`]
             head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
@@ -1012,9 +1012,7 @@ def forward(
          >>> model = WhisperModel.from_pretrained("openai/whisper-base")
          >>> feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
          >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-         >>> inputs = feature_extractor(
-         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
-         ... )
+         >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
          >>> input_features = inputs.input_features
          >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
          >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
@@ -1045,9 +1043,6 @@ def forward(
                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
             )
 
-        # downsample encoder attention mask
-        # decoder_attention_mask = None
-
         # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
@@ -1156,16 +1151,14 @@ def forward(
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 
-        >>> inputs = processor(
-        ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
-        ... )
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
         >>> input_features = inputs.input_features
 
         >>> generated_ids = model.generate(inputs=input_features)
 
         >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         >>> transcription
-        ' The quilter is the apostle of the middle classes and we are glad to welcome his'
+        ' Mr. Quilter is the apostle of the middle classes, and we are glad to'
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index eae091538dff8..0ea2ada8e4e25 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -40,126 +40,6 @@
 }
 
 
-LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-    "de": "german",
-    "es": "spanish",
-    "ru": "russian",
-    "ko": "korean",
-    "fr": "french",
-    "ja": "japanese",
-    "pt": "portuguese",
-    "tr": "turkish",
-    "pl": "polish",
-    "ca": "catalan",
-    "nl": "dutch",
-    "ar": "arabic",
-    "sv": "swedish",
-    "it": "italian",
-    "id": "indonesian",
-    "hi": "hindi",
-    "fi": "finnish",
-    "vi": "vietnamese",
-    "iw": "hebrew",
-    "uk": "ukrainian",
-    "el": "greek",
-    "ms": "malay",
-    "cs": "czech",
-    "ro": "romanian",
-    "da": "danish",
-    "hu": "hungarian",
-    "ta": "tamil",
-    "no": "norwegian",
-    "th": "thai",
-    "ur": "urdu",
-    "hr": "croatian",
-    "bg": "bulgarian",
-    "lt": "lithuanian",
-    "la": "latin",
-    "mi": "maori",
-    "ml": "malayalam",
-    "cy": "welsh",
-    "sk": "slovak",
-    "te": "telugu",
-    "fa": "persian",
-    "lv": "latvian",
-    "bn": "bengali",
-    "sr": "serbian",
-    "az": "azerbaijani",
-    "sl": "slovenian",
-    "kn": "kannada",
-    "et": "estonian",
-    "mk": "macedonian",
-    "br": "breton",
-    "eu": "basque",
-    "is": "icelandic",
-    "hy": "armenian",
-    "ne": "nepali",
-    "mn": "mongolian",
-    "bs": "bosnian",
-    "kk": "kazakh",
-    "sq": "albanian",
-    "sw": "swahili",
-    "gl": "galician",
-    "mr": "marathi",
-    "pa": "punjabi",
-    "si": "sinhala",
-    "km": "khmer",
-    "sn": "shona",
-    "yo": "yoruba",
-    "so": "somali",
-    "af": "afrikaans",
-    "oc": "occitan",
-    "ka": "georgian",
-    "be": "belarusian",
-    "tg": "tajik",
-    "sd": "sindhi",
-    "gu": "gujarati",
-    "am": "amharic",
-    "yi": "yiddish",
-    "lo": "lao",
-    "uz": "uzbek",
-    "fo": "faroese",
-    "ht": "haitian creole",
-    "ps": "pashto",
-    "tk": "turkmen",
-    "nn": "nynorsk",
-    "mt": "maltese",
-    "sa": "sanskrit",
-    "lb": "luxembourgish",
-    "my": "myanmar",
-    "bo": "tibetan",
-    "tl": "tagalog",
-    "mg": "malagasy",
-    "as": "assamese",
-    "tt": "tatar",
-    "haw": "hawaiian",
-    "ln": "lingala",
-    "ha": "hausa",
-    "ba": "bashkir",
-    "jw": "javanese",
-    "su": "sundanese",
-}
-
-
-# language code lookup by name, with a few language aliases
-TO_LANGUAGE_CODE = {
-    **{language: code for code, language in LANGUAGES.items()},
-    "burmese": "my",
-    "valencian": "ca",
-    "flemish": "nl",
-    "haitian": "ht",
-    "letzeburgesch": "lb",
-    "pushto": "ps",
-    "panjabi": "pa",
-    "moldavian": "ro",
-    "moldovan": "ro",
-    "sinhalese": "si",
-    "castilian": "es",
-}
-
-
 # Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
 def bytes_to_unicode():
     """
@@ -290,21 +170,6 @@ def get_vocab(self):
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    @property
-    def all_language_tokens(self) -> Tuple[int]:
-        result = []
-        for token, token_id in zip(
-            self.additional_special_tokens,
-            self.additional_special_tokens_ids,
-        ):
-            if token.strip("<|>") in LANGUAGES:
-                result.append(token_id)
-        return tuple(result)
-
-    @property
-    def all_language_codes(self) -> Tuple[str]:
-        return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens)
-
     @property
     def vocab_size(self) -> int:
         return len(self.encoder)
@@ -420,6 +285,10 @@ def _convert_id_to_token(self, index):
         return self.decoder.get(index, self.decoder.get(self.unk_token_id))
 
     def _normalize(self, text):
+        """
+        Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
+        english text.
+        """
         normalizer = EnglishTextNormalizer()
         return normalizer(text)
 

From a6361d447f0d66aa8a1ec81e8721b7376b776846 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 13:29:24 +0000
Subject: [PATCH 138/156] nit

---
 .../models/whisper/convert_openai_whisper_to_tfms.py        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
index 4163b81bf09e2..e5d0d561e6477 100644
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
@@ -61,9 +61,9 @@ def rename_keys(s_dict):
     keys = list(s_dict.keys())
     for key in keys:
         new_key = key
-        for k, v in WHISPER_MAPPING.items():
-            if k in key:
-                new_key = new_key.replace(k, v)
+        for old_key, new_key in WHISPER_MAPPING.items():
+            if old_key in key:
+                new_key = new_key.replace(old_key, new_key)
 
         print(f"{key} -> {new_key}")
 

From f92b9a8181f9a84114becd31a5a4210723cdf1ad Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 13:54:27 +0000
Subject: [PATCH 139/156] nits

---
 src/transformers/generation_logits_process.py |   7 +-
 src/transformers/generation_utils.py          |   3 +
 .../whisper/convert_openai_whisper_to_tfms.py | 213 ------------------
 3 files changed, 6 insertions(+), 217 deletions(-)
 delete mode 100644 src/transformers/models/whisper/convert_openai_whisper_to_tfms.py

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 21c180312cade..ddfe5aa5ef3ac 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -706,10 +706,9 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
 
 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     r"""
-    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function's starts
-    generating using `begin_index` tokens. This usually happens when `use_cache` us set to `True`, and the
-    `decoder_input_ids` only include the previously generated token along with and `eos_token` if `forced_eos_token` is
-    used.
+    [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
+    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not 
+    sampled at the begining of the generation. 
     """
 
     def __init__(self, begin_suppress_tokens, begin_index):
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index e4c87268e5899..9652146650961 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1110,6 +1110,9 @@ def generate(
             suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.suppress_tokens`):
                 A list of tokens that will be supressed at generation. The `SupressTokens` logit processor will set
                 their log probs to `-inf` so that they are not sampled.
+            begin_suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.begin_suppress_tokens`):
+                A list of tokens that will be supressed at the begining of the generation. The `SupressBeginTokens` logit processor will set
+                their log probs to `-inf` so that they are not sampled.
 
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
diff --git a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py b/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
deleted file mode 100644
index e5d0d561e6477..0000000000000
--- a/src/transformers/models/whisper/convert_openai_whisper_to_tfms.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert Whisper checkpoints from the original repository. URL: https://github.com/openai/whisper"""
-
-import argparse
-import hashlib
-import io
-import os
-import urllib
-import warnings
-
-import torch
-from tqdm import tqdm
-
-from transformers import WhisperConfig, WhisperForConditionalGeneration
-
-
-def remove_ignore_keys_(state_dict):
-    ignore_keys = ["layers", "blocks"]
-    for k in ignore_keys:
-        state_dict.pop(k, None)
-
-
-WHISPER_MAPPING = {
-    "blocks": "layers",
-    "mlp.0": "fc1",
-    "mlp.2": "fc2",
-    "mlp_ln": "final_layer_norm",
-    "blocks": "layers",
-    ".attn.query": ".self_attn.q_proj",
-    ".attn.key": ".self_attn.k_proj",
-    ".attn.value": ".self_attn.v_proj",
-    ".attn_ln": ".self_attn_layer_norm",
-    ".attn.out": ".self_attn.out_proj",
-    ".cross_attn.query": ".encoder_attn.q_proj",
-    ".cross_attn.key": ".encoder_attn.k_proj",
-    ".cross_attn.value": ".encoder_attn.v_proj",
-    ".cross_attn_ln": ".encoder_attn_layer_norm",
-    ".cross_attn.out": ".encoder_attn.out_proj",
-    "decoder.ln.": "decoder.layer_norm.",
-    "encoder.ln.": "encoder.layer_norm.",
-    "token_embedding": "embed_tokens",
-    "encoder.positional_embedding": "encoder.embed_positions.weight",
-    "decoder.positional_embedding": "decoder.embed_positions.weight",
-    "ln_post": "layer_norm",
-}
-
-
-def rename_keys(s_dict):
-    keys = list(s_dict.keys())
-    for key in keys:
-        new_key = key
-        for old_key, new_key in WHISPER_MAPPING.items():
-            if old_key in key:
-                new_key = new_key.replace(old_key, new_key)
-
-        print(f"{key} -> {new_key}")
-
-        s_dict[new_key] = s_dict.pop(key)
-    return s_dict
-
-
-def convert_openai_whisper_to_tfms(checkpoint_name, pytorch_dump_folder_path, checkpoint_path="weights"):
-    full_path = os.path.join(os.getcwd(), checkpoint_path)
-    if not os.path.isdir(os.path.join(full_path)):
-        os.makedirs(full_path, exist_ok=True)
-        try:
-            _, checkpoint_path = _download(_MODELS[checkpoint_name], full_path)
-        except KeyError:
-            print("The original checkpoint should be in _MODELS ")
-
-    print(f"Loading model from : {full_path}/{checkpoint_name}")
-    original_checkpoint = torch.load(os.path.join(full_path, checkpoint_name) + ".pt", map_location="cpu")
-    dimensions = original_checkpoint["dims"]
-    state_dict = original_checkpoint["model_state_dict"]
-
-    remove_ignore_keys_(state_dict)
-    rename_keys(state_dict)
-
-    config = WhisperConfig(
-        vocab_size=dimensions["n_vocab"],
-        num_mel_bins=dimensions["n_mels"],
-        d_model=dimensions["n_audio_state"],
-        max_target_positions=dimensions["n_text_ctx"],
-        encoder_layers=dimensions["n_audio_layer"],
-        encoder_attention_heads=dimensions["n_audio_head"],
-        decoder_layers=dimensions["n_text_layer"],
-        decoder_attention_heads=dimensions["n_text_head"],
-        max_source_positions=dimensions["n_audio_ctx"],
-        decoder_ffn_dim=4 * dimensions["n_audio_state"],
-        encoder_ffn_dim=4 * dimensions["n_audio_state"],
-    )
-
-    model = WhisperForConditionalGeneration(config)
-    missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
-    if len(missing) > 0 and not set(missing) <= set(
-        [
-            "encoder.embed_positions.weights",
-            "decoder.embed_positions.weights",
-        ]
-    ):
-        raise ValueError(
-            "Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights`  are allowed to be missing,"
-            f" but all the following weights are missing {missing}"
-        )
-
-    model.save_pretrained(os.path.join(pytorch_dump_folder_path, checkpoint_name))
-
-
-_MODELS = {
-    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
-    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
-    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
-    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
-    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
-    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
-    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
-    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
-    "large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
-}
-
-
-def _download(url: str, root: str) -> bytes:
-    # Copied from whisper's original codebase
-    os.makedirs(root, exist_ok=True)
-    filename = os.path.basename(url)
-
-    expected_sha256 = url.split("/")[-2]
-    download_target = os.path.join(root, filename)
-
-    if os.path.exists(download_target) and not os.path.isfile(download_target):
-        raise RuntimeError(f"{download_target} exists and is not a regular file")
-
-    if os.path.isfile(download_target):
-        model_bytes = open(download_target, "rb").read()
-        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
-            return None, download_target
-        else:
-            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
-
-    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
-        with tqdm(
-            total=int(source.info().get("Content-Length")), ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
-        ) as loop:
-            while True:
-                buffer = source.read(8192)
-                if not buffer:
-                    break
-
-                output.write(buffer)
-                loop.update(len(buffer))
-
-    model_bytes = open(download_target, "rb").read()
-    if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
-        raise RuntimeError(
-            "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
-        )
-
-    return model_bytes, download_target
-
-
-def convert_every_model(save_dir="whisper"):
-    layers = [4, 6, 12, 24, 32, 4, 6, 12, 24]
-    width = [384, 512, 768, 1024, 1280, 384, 512, 768, 1024]
-    heads = [6, 8, 12, 16, 20, 6, 8, 12, 16]
-    name = ["tiny", "base", "small", "medium", "large", "tiny.en", "base.en", "small.en", "medium.en"]
-    for l, w, h, n in zip(layers, width, heads, name):
-
-        config = WhisperConfig(
-            vocab_size=51865,
-            encoder_layers=l,
-            encoder_attention_heads=h,
-            decoder_attention_heads=h,
-            decoder_layers=l,
-            d_model=w,
-            decoder_ffn_dim=4 * w,
-            encoder_ffn_dim=4 * w,
-        )
-        model = WhisperForConditionalGeneration(config)
-
-        model_bytes, _ = _download(_MODELS[n], "original-weights")
-        with io.BytesIO(model_bytes) as fp:
-            original = torch.load(fp, map_location="cpu")["model_state_dict"]
-
-        new = rename_keys(original.copy())
-
-        missing, unexpected = model.load_state_dict(new, strict=False)
-        if missing == []:
-            print("succesfully loaded")
-            model.save_pretrained(f"{save_dir}/{n}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument("--original_name", type=str, help="Path to the fairseq model (.pt) file.")
-    parser.add_argument(
-        "--pytorch_dump_folder_path", default="whisper-converted", type=str, help="Path to the output PyTorch model."
-    )
-    args = parser.parse_args()
-
-    convert_openai_whisper_to_tfms(args.original_name, args.pytorch_dump_folder_path)

From 8d40196681cba841d00d1aa59088528cabb126ef Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 14:00:54 +0000
Subject: [PATCH 140/156] last styling

---
 src/transformers/generation_logits_process.py | 4 ++--
 src/transformers/generation_utils.py          | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index ddfe5aa5ef3ac..9bbd94bb4e972 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -707,8 +707,8 @@ def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tenso
 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     r"""
     [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
-    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not 
-    sampled at the begining of the generation. 
+    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
+    sampled at the begining of the generation.
     """
 
     def __init__(self, begin_suppress_tokens, begin_index):
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 9652146650961..d6d6fda1ede78 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1111,8 +1111,8 @@ def generate(
                 A list of tokens that will be supressed at generation. The `SupressTokens` logit processor will set
                 their log probs to `-inf` so that they are not sampled.
             begin_suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.begin_suppress_tokens`):
-                A list of tokens that will be supressed at the begining of the generation. The `SupressBeginTokens` logit processor will set
-                their log probs to `-inf` so that they are not sampled.
+                A list of tokens that will be supressed at the begining of the generation. The `SupressBeginTokens`
+                logit processor will set their log probs to `-inf` so that they are not sampled.
 
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model

From 392563e9c6fc90f4754662cd7139cfa4747d2ea3 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 14:45:35 +0000
Subject: [PATCH 141/156] add main toctree file

---
 docs/source/en/_toctree.yml | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index e96c26aeb07ca..7b83bafbd867e 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -31,7 +31,7 @@
     - local: sagemaker
       title: Run training on Amazon SageMaker
     - local: converting_tensorflow_models
-      title: Converting TensorFlow Checkpoints
+      title: Converting from TensorFlow checkpoints
     - local: serialization
       title: Export 🤗 Transformers models
     - local: troubleshooting
@@ -42,8 +42,7 @@
       title: Use tokenizers from 🤗 Tokenizers
     - local: multilingual
       title: Inference for multilingual models
-    - isExpanded: false
-      sections:
+    - sections:
       - local: tasks/sequence_classification
         title: Text classification
       - local: tasks/token_classification
@@ -59,6 +58,7 @@
       - local: tasks/multiple_choice
         title: Multiple choice
       title: Task guides
+      isExpanded: false
     title: Natural Language Processing
   - sections:
     - local: tasks/audio_classification
@@ -109,6 +109,8 @@
       title: How to contribute to transformers?
     - local: add_new_model
       title: How to add a model to 🤗 Transformers?
+    - local: add_tensorflow_model
+      title: How to convert a 🤗 Transformers model to TensorFlow?
     - local: add_new_pipeline
       title: How to add a pipeline to 🤗 Transformers?
     - local: testing
@@ -241,6 +243,8 @@
         title: Encoder Decoder Models
       - local: model_doc/ernie
         title: ERNIE
+      - local: model_doc/esm
+        title: ESM
       - local: model_doc/flaubert
         title: FlauBERT
       - local: model_doc/fnet
@@ -279,6 +283,8 @@
         title: M2M100
       - local: model_doc/marian
         title: MarianMT
+      - local: model_doc/markuplm
+        title: MarkupLM
       - local: model_doc/mbart
         title: MBart and MBart-50
       - local: model_doc/megatron-bert
@@ -494,6 +500,11 @@
       - local: model_doc/trajectory_transformer
         title: Trajectory Transformer
       title: Reinforcement learning models
+    - isExpanded: false
+      sections:
+      - local: model_doc/time_series_transformer
+        title: Time Series Transformer
+      title: Time series models
     title: Models
   - sections:
     - local: internal/modeling_utils
@@ -509,4 +520,4 @@
     - local: internal/file_utils
       title: General Utilities
     title: Internal Helpers
-  title: API
+  title: API
\ No newline at end of file

From 009cdefa7a890093b650972f189302245d20e10a Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 19:05:18 +0000
Subject: [PATCH 142/156] remove sentence piece dependency

---
 src/transformers/__init__.py                        | 13 ++++++++-----
 .../utils/dummy_sentencepiece_and_speech_objects.py |  7 -------
 .../utils/dummy_sentencepiece_objects.py            |  7 -------
 3 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index af4786793795d..a0c039b7e80fa 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -393,6 +393,8 @@
     "models.whisper": [
         "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "WhisperConfig",
+        "WhisperFeatureExtractor",
+        "WhisperProcessor",
     ],
     "models.x_clip": [
         "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -638,7 +640,6 @@
 else:
     _import_structure["models.mctct"].append("MCTCTFeatureExtractor")
     _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
-    _import_structure["models.whisper"].append("WhisperFeatureExtractor")
 
 # Tensorflow-text-specific objects
 try:
@@ -664,7 +665,6 @@
     ]
 else:
     _import_structure["models.speech_to_text"].append("Speech2TextProcessor")
-    _import_structure["models.whisper"].append("WhisperProcessor")
 
 # Vision-specific objects
 try:
@@ -3352,7 +3352,12 @@
     from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
     from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
     from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig
-    from .models.whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig
+    from .models.whisper import (
+        WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        WhisperConfig,
+        WhisperFeatureExtractor,
+        WhisperProcessor,
+    )
     from .models.x_clip import (
         XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
         XCLIPConfig,
@@ -3578,7 +3583,6 @@
     else:
         from .models.mctct import MCTCTFeatureExtractor
         from .models.speech_to_text import Speech2TextFeatureExtractor
-        from .models.whisper import WhisperFeatureExtractor
 
     try:
         if not is_tensorflow_text_available():
@@ -3595,7 +3599,6 @@
         from .utils.dummy_sentencepiece_and_speech_objects import *
     else:
         from .models.speech_to_text import Speech2TextProcessor
-        from .models.whisper import WhisperProcessor
 
     try:
         if not is_vision_available():
diff --git a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
index 9e3cdf54ab76b..b9b971f1f15c7 100644
--- a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
@@ -8,10 +8,3 @@ class Speech2TextProcessor(metaclass=DummyObject):
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece", "speech"])
-
-
-class WhisperProcessor(metaclass=DummyObject):
-    _backends = ["sentencepiece", "speech"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece", "speech"])
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index 633cfd3e031d0..69f0bdcb7b1aa 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -164,13 +164,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["sentencepiece"])
 
 
-class WhisperTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
 class XGLMTokenizer(metaclass=DummyObject):
     _backends = ["sentencepiece"]
 

From ef83269512986ef8bb4f8e9f06535bb70d71e4f4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 19:11:59 +0000
Subject: [PATCH 143/156] clean init file

---
 src/transformers/models/whisper/__init__.py   | 56 ++++---------------
 .../models/whisper/tokenization_whisper.py    |  2 -
 2 files changed, 11 insertions(+), 47 deletions(-)

diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py
index 2de334e5cbc07..be735594e69dc 100644
--- a/src/transformers/models/whisper/__init__.py
+++ b/src/transformers/models/whisper/__init__.py
@@ -17,37 +17,19 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import (
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    is_sentencepiece_available,
-    is_speech_available,
-    is_torch_available,
-)
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {
-    "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig"],
+    "configuration_whisper": [
+        "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "WhisperConfig",
+        "WhisperFeatureExtractor",
+        "WhisperProcessor",
+        "WhisperTokenizer",
+    ],
 }
 
-try:
-    if not is_sentencepiece_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["tokenization_whisper"] = ["WhisperTokenizer"]
-
-try:
-    if not is_speech_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["feature_extraction_whisper"] = ["WhisperFeatureExtractor"]
-
-    if is_sentencepiece_available():
-        _import_structure["processing_whisper"] = ["WhisperProcessor"]
 
 try:
     if not is_torch_available():
@@ -65,25 +47,9 @@
 
 if TYPE_CHECKING:
     from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig
-
-    try:
-        if not is_sentencepiece_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .tokenization_whisper import WhisperTokenizer
-
-    try:
-        if not is_speech_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .feature_extraction_whisper import WhisperFeatureExtractor
-
-        if is_sentencepiece_available():
-            from .processing_whisper import WhisperProcessor
+    from .feature_extraction_whisper import WhisperFeatureExtractor
+    from .processing_whisper import WhisperProcessor
+    from .tokenization_whisper import WhisperTokenizer
 
     try:
         if not is_torch_available():
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index 0ea2ada8e4e25..dc9b45d974850 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -24,8 +24,6 @@
 from .english_normalizer import EnglishTextNormalizer
 
 
-SPIECE_UNDERLINE = "▁"
-
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json", "merges_file": "merges.txt"}
 
 PRETRAINED_VOCAB_FILES_MAP = {

From 78d1ed29e574552d6a56665fe216da4e9f5bb494 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 19:16:35 +0000
Subject: [PATCH 144/156] fix tokenizer that has no dependencies on
 sentencepiece

---
 src/transformers/__init__.py                   | 4 ++--
 src/transformers/utils/dummy_speech_objects.py | 7 -------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index a0c039b7e80fa..2a2ab7d88cf72 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -395,6 +395,7 @@
         "WhisperConfig",
         "WhisperFeatureExtractor",
         "WhisperProcessor",
+        "WhisperTokenizer",
     ],
     "models.x_clip": [
         "XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -538,7 +539,6 @@
     _import_structure["models.reformer"].append("ReformerTokenizer")
     _import_structure["models.rembert"].append("RemBertTokenizer")
     _import_structure["models.speech_to_text"].append("Speech2TextTokenizer")
-    _import_structure["models.whisper"].append("WhisperTokenizer")
     _import_structure["models.t5"].append("T5Tokenizer")
     _import_structure["models.xglm"].append("XGLMTokenizer")
     _import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer")
@@ -3357,6 +3357,7 @@
         WhisperConfig,
         WhisperFeatureExtractor,
         WhisperProcessor,
+        WhisperTokenizer,
     )
     from .models.x_clip import (
         XCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -3499,7 +3500,6 @@
         from .models.rembert import RemBertTokenizer
         from .models.speech_to_text import Speech2TextTokenizer
         from .models.t5 import T5Tokenizer
-        from .models.whisper import WhisperTokenizer
         from .models.xglm import XGLMTokenizer
         from .models.xlm_prophetnet import XLMProphetNetTokenizer
         from .models.xlm_roberta import XLMRobertaTokenizer
diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py
index a2b4a40961b53..ae5589292a4cf 100644
--- a/src/transformers/utils/dummy_speech_objects.py
+++ b/src/transformers/utils/dummy_speech_objects.py
@@ -15,10 +15,3 @@ class Speech2TextFeatureExtractor(metaclass=DummyObject):
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["speech"])
-
-
-class WhisperFeatureExtractor(metaclass=DummyObject):
-    _backends = ["speech"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["speech"])

From f572f5f9b8a8022d3ceaae6942d270fc5adab456 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Mon, 3 Oct 2022 19:34:02 +0000
Subject: [PATCH 145/156] update whisper init file, nit

---
 src/transformers/models/whisper/__init__.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py
index be735594e69dc..ea7259cf69c41 100644
--- a/src/transformers/models/whisper/__init__.py
+++ b/src/transformers/models/whisper/__init__.py
@@ -21,13 +21,10 @@
 
 
 _import_structure = {
-    "configuration_whisper": [
-        "WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "WhisperConfig",
-        "WhisperFeatureExtractor",
-        "WhisperProcessor",
-        "WhisperTokenizer",
-    ],
+    "configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig"],
+    "feature_extraction_whisper": ["WhisperFeatureExtractor"],
+    "processing_whisper": ["WhisperProcessor"],
+    "tokenization_whisper": ["WhisperTokenizer"],
 }
 
 

From 837a41070ddaf5794452fea9ef158c2de32093d6 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 14:04:38 +0000
Subject: [PATCH 146/156] remove english.json file

---
 src/transformers/models/whisper/english.json  | 1742 ----------------
 .../models/whisper/english_normalizer.py      | 1751 ++++++++++++++++-
 2 files changed, 1746 insertions(+), 1747 deletions(-)
 delete mode 100644 src/transformers/models/whisper/english.json

diff --git a/src/transformers/models/whisper/english.json b/src/transformers/models/whisper/english.json
deleted file mode 100644
index bd84ae73faeb4..0000000000000
--- a/src/transformers/models/whisper/english.json
+++ /dev/null
@@ -1,1742 +0,0 @@
-{
-    "accessorise": "accessorize",
-    "accessorised": "accessorized",
-    "accessorises": "accessorizes",
-    "accessorising": "accessorizing",
-    "acclimatisation": "acclimatization",
-    "acclimatise": "acclimatize",
-    "acclimatised": "acclimatized",
-    "acclimatises": "acclimatizes",
-    "acclimatising": "acclimatizing",
-    "accoutrements": "accouterments",
-    "aeon": "eon",
-    "aeons": "eons",
-    "aerogramme": "aerogram",
-    "aerogrammes": "aerograms",
-    "aeroplane": "airplane",
-    "aeroplanes": "airplanes",
-    "aesthete": "esthete",
-    "aesthetes": "esthetes",
-    "aesthetic": "esthetic",
-    "aesthetically": "esthetically",
-    "aesthetics": "esthetics",
-    "aetiology": "etiology",
-    "ageing": "aging",
-    "aggrandisement": "aggrandizement",
-    "agonise": "agonize",
-    "agonised": "agonized",
-    "agonises": "agonizes",
-    "agonising": "agonizing",
-    "agonisingly": "agonizingly",
-    "almanack": "almanac",
-    "almanacks": "almanacs",
-    "aluminium": "aluminum",
-    "amortisable": "amortizable",
-    "amortisation": "amortization",
-    "amortisations": "amortizations",
-    "amortise": "amortize",
-    "amortised": "amortized",
-    "amortises": "amortizes",
-    "amortising": "amortizing",
-    "amphitheatre": "amphitheater",
-    "amphitheatres": "amphitheaters",
-    "anaemia": "anemia",
-    "anaemic": "anemic",
-    "anaesthesia": "anesthesia",
-    "anaesthetic": "anesthetic",
-    "anaesthetics": "anesthetics",
-    "anaesthetise": "anesthetize",
-    "anaesthetised": "anesthetized",
-    "anaesthetises": "anesthetizes",
-    "anaesthetising": "anesthetizing",
-    "anaesthetist": "anesthetist",
-    "anaesthetists": "anesthetists",
-    "anaesthetize": "anesthetize",
-    "anaesthetized": "anesthetized",
-    "anaesthetizes": "anesthetizes",
-    "anaesthetizing": "anesthetizing",
-    "analogue": "analog",
-    "analogues": "analogs",
-    "analyse": "analyze",
-    "analysed": "analyzed",
-    "analyses": "analyzes",
-    "analysing": "analyzing",
-    "anglicise": "anglicize",
-    "anglicised": "anglicized",
-    "anglicises": "anglicizes",
-    "anglicising": "anglicizing",
-    "annualised": "annualized",
-    "antagonise": "antagonize",
-    "antagonised": "antagonized",
-    "antagonises": "antagonizes",
-    "antagonising": "antagonizing",
-    "apologise": "apologize",
-    "apologised": "apologized",
-    "apologises": "apologizes",
-    "apologising": "apologizing",
-    "appal": "appall",
-    "appals": "appalls",
-    "appetiser": "appetizer",
-    "appetisers": "appetizers",
-    "appetising": "appetizing",
-    "appetisingly": "appetizingly",
-    "arbour": "arbor",
-    "arbours": "arbors",
-    "archeological": "archaeological",
-    "archaeologically": "archeologically",
-    "archaeologist": "archeologist",
-    "archaeologists": "archeologists",
-    "archaeology": "archeology</span>",
-    "ardour": "ardor",
-    "armour": "armor",
-    "armoured": "armored",
-    "armourer": "armorer",
-    "armourers": "armorers",
-    "armouries": "armories",
-    "armoury": "armory",
-    "artefact": "artifact",
-    "artefacts": "artifacts",
-    "authorise": "authorize",
-    "authorised": "authorized",
-    "authorises": "authorizes",
-    "authorising": "authorizing",
-    "axe": "ax",
-    "backpedalled": "backpedaled",
-    "backpedalling": "backpedaling",
-    "bannister": "banister",
-    "bannisters": "banisters",
-    "baptise": "baptize",
-    "baptised": "baptized",
-    "baptises": "baptizes",
-    "baptising": "baptizing",
-    "bastardise": "bastardize",
-    "bastardised": "bastardized",
-    "bastardises": "bastardizes",
-    "bastardising": "bastardizing",
-    "battleax": "battleaxe",
-    "baulk": "balk",
-    "baulked": "balked",
-    "baulking": "balking",
-    "baulks": "balks",
-    "bedevilled": "bedeviled",
-    "bedevilling": "bedeviling",
-    "behaviour": "behavior",
-    "behavioural": "behavioral",
-    "behaviourism": "behaviorism",
-    "behaviourist": "behaviorist",
-    "behaviourists": "behaviorists",
-    "behaviours": "behaviors",
-    "behove": "behoove",
-    "behoved": "behooved",
-    "behoves": "behooves",
-    "bejewelled": "bejeweled",
-    "belabour": "belabor",
-    "belaboured": "belabored",
-    "belabouring": "belaboring",
-    "belabours": "belabors",
-    "bevelled": "beveled",
-    "bevvies": "bevies",
-    "bevvy": "bevy",
-    "biassed": "biased",
-    "biassing": "biasing",
-    "bingeing": "binging",
-    "bougainvillaea": "bougainvillea",
-    "bougainvillaeas": "bougainvilleas",
-    "bowdlerise": "bowdlerize",
-    "bowdlerised": "bowdlerized",
-    "bowdlerises": "bowdlerizes",
-    "bowdlerising": "bowdlerizing",
-    "breathalyse": "breathalyze",
-    "breathalysed": "breathalyzed",
-    "breathalyser": "breathalyzer",
-    "breathalysers": "breathalyzers",
-    "breathalyses": "breathalyzes",
-    "breathalysing": "breathalyzing",
-    "brutalise": "brutalize",
-    "brutalised": "brutalized",
-    "brutalises": "brutalizes",
-    "brutalising": "brutalizing",
-    "busses": "buses",
-    "bussing": "busing",
-    "caesarean": "cesarean",
-    "caesareans": "cesareans",
-    "calibre": "caliber",
-    "calibres": "calibers",
-    "calliper": "caliper",
-    "callipers": "calipers",
-    "callisthenics": "calisthenics",
-    "canalise": "canalize",
-    "canalised": "canalized",
-    "canalises": "canalizes",
-    "canalising": "canalizing",
-    "cancelation": "cancellation",
-    "cancelations": "cancellations",
-    "cancelled": "canceled",
-    "cancelling": "canceling",
-    "candour": "candor",
-    "cannibalise": "cannibalize",
-    "cannibalised": "cannibalized",
-    "cannibalises": "cannibalizes",
-    "cannibalising": "cannibalizing",
-    "canonise": "canonize",
-    "canonised": "canonized",
-    "canonises": "canonizes",
-    "canonising": "canonizing",
-    "capitalise": "capitalize",
-    "capitalised": "capitalized",
-    "capitalises": "capitalizes",
-    "capitalising": "capitalizing",
-    "caramelise": "caramelize",
-    "caramelised": "caramelized",
-    "caramelises": "caramelizes",
-    "caramelising": "caramelizing",
-    "carbonise": "carbonize",
-    "carbonised": "carbonized",
-    "carbonises": "carbonizes",
-    "carbonising": "carbonizing",
-    "carolled": "caroled",
-    "carolling": "caroling",
-    "catalogue": "catalog",
-    "catalogued": "cataloged",
-    "catalogues": "catalogs",
-    "cataloguing": "cataloging",
-    "catalyse": "catalyze",
-    "catalysed": "catalyzed",
-    "catalyses": "catalyzes",
-    "catalysing": "catalyzing",
-    "categorise": "categorize",
-    "categorised": "categorized",
-    "categorises": "categorizes",
-    "categorising": "categorizing",
-    "cauterise": "cauterize",
-    "cauterised": "cauterized",
-    "cauterises": "cauterizes",
-    "cauterising": "cauterizing",
-    "cavilled": "caviled",
-    "cavilling": "caviling",
-    "centigramme": "centigram",
-    "centigrammes": "centigrams",
-    "centilitre": "centiliter",
-    "centilitres": "centiliters",
-    "centimetre": "centimeter",
-    "centimetres": "centimeters",
-    "centralise": "centralize",
-    "centralised": "centralized",
-    "centralises": "centralizes",
-    "centralising": "centralizing",
-    "centre": "center",
-    "centred": "centered",
-    "centrefold": "centerfold",
-    "centrefolds": "centerfolds",
-    "centrepiece": "centerpiece",
-    "centrepieces": "centerpieces",
-    "centres": "centers",
-    "channelled": "channeled",
-    "channelling": "channeling",
-    "characterise": "characterize",
-    "characterised": "characterized",
-    "characterises": "characterizes",
-    "characterising": "characterizing",
-    "cheque": "check",
-    "chequebook": "checkbook",
-    "chequebooks": "checkbooks",
-    "chequered": "checkered",
-    "cheques": "checks",
-    "chilli": "chili",
-    "chimaera": "chimera",
-    "chimaeras": "chimeras",
-    "chiselled": "chiseled",
-    "chiselling": "chiseling",
-    "circularise": "circularize",
-    "circularised": "circularized",
-    "circularises": "circularizes",
-    "circularising": "circularizing",
-    "civilise": "civilize",
-    "civilised": "civilized",
-    "civilises": "civilizes",
-    "civilising": "civilizing",
-    "clamour": "clamor",
-    "clamoured": "clamored",
-    "clamouring": "clamoring",
-    "clamours": "clamors",
-    "clangour": "clangor",
-    "clarinettist": "clarinetist",
-    "clarinettists": "clarinetists",
-    "collectivise": "collectivize",
-    "collectivised": "collectivized",
-    "collectivises": "collectivizes",
-    "collectivising": "collectivizing",
-    "colonisation": "colonization",
-    "colonise": "colonize",
-    "colonised": "colonized",
-    "coloniser": "colonizer",
-    "colonisers": "colonizers",
-    "colonises": "colonizes",
-    "colonising": "colonizing",
-    "colour": "color",
-    "colourant": "colorant",
-    "colourants": "colorants",
-    "coloured": "colored",
-    "coloureds": "coloreds",
-    "colourful": "colorful",
-    "colourfully": "colorfully",
-    "colouring": "coloring",
-    "colourize": "colorize",
-    "colourized": "colorized",
-    "colourizes": "colorizes",
-    "colourizing": "colorizing",
-    "colourless": "colorless",
-    "colours": "colors",
-    "commercialise": "commercialize",
-    "commercialised": "commercialized",
-    "commercialises": "commercializes",
-    "commercialising": "commercializing",
-    "compartmentalise": "compartmentalize",
-    "compartmentalised": "compartmentalized",
-    "compartmentalises": "compartmentalizes",
-    "compartmentalising": "compartmentalizing",
-    "computerise": "computerize",
-    "computerised": "computerized",
-    "computerises": "computerizes",
-    "computerising": "computerizing",
-    "conceptualise": "conceptualize",
-    "conceptualised": "conceptualized",
-    "conceptualises": "conceptualizes",
-    "conceptualising": "conceptualizing",
-    "connexion": "connection",
-    "connexions": "connections",
-    "contextualise": "contextualize",
-    "contextualised": "contextualized",
-    "contextualises": "contextualizes",
-    "contextualising": "contextualizing",
-    "cosier": "cozier",
-    "cosies": "cozies",
-    "cosiest": "coziest",
-    "cosily": "cozily",
-    "cosiness": "coziness",
-    "cosy": "cozy",
-    "councillor": "councilor",
-    "councillors": "councilors",
-    "counselled": "counseled",
-    "counselling": "counseling",
-    "counsellor": "counselor",
-    "counsellors": "counselors",
-    "crenelated": "crenellated",
-    "criminalise": "criminalize",
-    "criminalised": "criminalized",
-    "criminalises": "criminalizes",
-    "criminalising": "criminalizing",
-    "criticise": "criticize",
-    "criticised": "criticized",
-    "criticises": "criticizes",
-    "criticising": "criticizing",
-    "crueller": "crueler",
-    "cruellest": "cruelest",
-    "crystallisation": "crystallization",
-    "crystallise": "crystallize",
-    "crystallised": "crystallized",
-    "crystallises": "crystallizes",
-    "crystallising": "crystallizing",
-    "cudgelled": "cudgeled",
-    "cudgelling": "cudgeling",
-    "customise": "customize",
-    "customised": "customized",
-    "customises": "customizes",
-    "customising": "customizing",
-    "cypher": "cipher",
-    "cyphers": "ciphers",
-    "decentralisation": "decentralization",
-    "decentralise": "decentralize",
-    "decentralised": "decentralized",
-    "decentralises": "decentralizes",
-    "decentralising": "decentralizing",
-    "decriminalisation": "decriminalization",
-    "decriminalise": "decriminalize",
-    "decriminalised": "decriminalized",
-    "decriminalises": "decriminalizes",
-    "decriminalising": "decriminalizing",
-    "defence": "defense",
-    "defenceless": "defenseless",
-    "defences": "defenses",
-    "dehumanisation": "dehumanization",
-    "dehumanise": "dehumanize",
-    "dehumanised": "dehumanized",
-    "dehumanises": "dehumanizes",
-    "dehumanising": "dehumanizing",
-    "demeanour": "demeanor",
-    "demilitarisation": "demilitarization",
-    "demilitarise": "demilitarize",
-    "demilitarised": "demilitarized",
-    "demilitarises": "demilitarizes",
-    "demilitarising": "demilitarizing",
-    "demobilisation": "demobilization",
-    "demobilise": "demobilize",
-    "demobilised": "demobilized",
-    "demobilises": "demobilizes",
-    "demobilising": "demobilizing",
-    "democratisation": "democratization",
-    "democratise": "democratize",
-    "democratised": "democratized",
-    "democratises": "democratizes",
-    "democratising": "democratizing",
-    "demonise": "demonize",
-    "demonised": "demonized",
-    "demonises": "demonizes",
-    "demonising": "demonizing",
-    "demoralisation": "demoralization",
-    "demoralise": "demoralize",
-    "demoralised": "demoralized",
-    "demoralises": "demoralizes",
-    "demoralising": "demoralizing",
-    "denationalisation": "denationalization",
-    "denationalise": "denationalize",
-    "denationalised": "denationalized",
-    "denationalises": "denationalizes",
-    "denationalising": "denationalizing",
-    "deodorise": "deodorize",
-    "deodorised": "deodorized",
-    "deodorises": "deodorizes",
-    "deodorising": "deodorizing",
-    "depersonalise": "depersonalize",
-    "depersonalised": "depersonalized",
-    "depersonalises": "depersonalizes",
-    "depersonalising": "depersonalizing",
-    "deputise": "deputize",
-    "deputised": "deputized",
-    "deputises": "deputizes",
-    "deputising": "deputizing",
-    "desensitisation": "desensitization",
-    "desensitise": "desensitize",
-    "desensitised": "desensitized",
-    "desensitises": "desensitizes",
-    "desensitising": "desensitizing",
-    "destabilisation": "destabilization",
-    "destabilise": "destabilize",
-    "destabilised": "destabilized",
-    "destabilises": "destabilizes",
-    "destabilising": "destabilizing",
-    "dialled": "dialed",
-    "dialling": "dialing",
-    "dialogue": "dialog",
-    "dialogues": "dialogs",
-    "diarrhoea": "diarrhea",
-    "digitise": "digitize",
-    "digitised": "digitized",
-    "digitises": "digitizes",
-    "digitising": "digitizing",
-    "disc": "disk",
-    "discolour": "discolor",
-    "discoloured": "discolored",
-    "discolouring": "discoloring",
-    "discolours": "discolors",
-    "discs": "disks",
-    "disembowelled": "disemboweled",
-    "disembowelling": "disemboweling",
-    "disfavour": "disfavor",
-    "dishevelled": "disheveled",
-    "dishonour": "dishonor",
-    "dishonourable": "dishonorable",
-    "dishonourably": "dishonorably",
-    "dishonoured": "dishonored",
-    "dishonouring": "dishonoring",
-    "dishonours": "dishonors",
-    "disorganisation": "disorganization",
-    "disorganised": "disorganized",
-    "distil": "distill",
-    "distils": "distills",
-    "dramatisation": "dramatization",
-    "dramatisations": "dramatizations",
-    "dramatise": "dramatize",
-    "dramatised": "dramatized",
-    "dramatises": "dramatizes",
-    "dramatising": "dramatizing",
-    "draught": "draft",
-    "draughtboard": "draftboard",
-    "draughtboards": "draftboards",
-    "draughtier": "draftier",
-    "draughtiest": "draftiest",
-    "draughts": "drafts",
-    "draughtsman": "draftsman",
-    "draughtsmanship": "draftsmanship",
-    "draughtsmen": "draftsmen",
-    "draughtswoman": "draftswoman",
-    "draughtswomen": "draftswomen",
-    "draughty": "drafty",
-    "drivelled": "driveled",
-    "drivelling": "driveling",
-    "duelled": "dueled",
-    "duelling": "dueling",
-    "economise": "economize",
-    "economised": "economized",
-    "economises": "economizes",
-    "economising": "economizing",
-    "edoema": "edema",
-    "editorialise": "editorialize",
-    "editorialised": "editorialized",
-    "editorialises": "editorializes",
-    "editorialising": "editorializing",
-    "empathise": "empathize",
-    "empathised": "empathized",
-    "empathises": "empathizes",
-    "empathising": "empathizing",
-    "emphasise": "emphasize",
-    "emphasised": "emphasized",
-    "emphasises": "emphasizes",
-    "emphasising": "emphasizing",
-    "enamelled": "enameled",
-    "enamelling": "enameling",
-    "enamoured": "enamored",
-    "encyclopaedia": "encyclopedia",
-    "encyclopaedias": "encyclopedias",
-    "encyclopaedic": "encyclopedic",
-    "endeavour": "endeavor",
-    "endeavoured": "endeavored",
-    "endeavouring": "endeavoring",
-    "endeavours": "endeavors",
-    "energise": "energize",
-    "energised": "energized",
-    "energises": "energizes",
-    "energising": "energizing",
-    "enrol": "enroll",
-    "enrols": "enrolls",
-    "enthral": "enthrall",
-    "enthrals": "enthralls",
-    "epaulette": "epaulet",
-    "epaulettes": "epaulets",
-    "epicentre": "epicenter",
-    "epicentres": "epicenters",
-    "epilogue": "epilog",
-    "epilogues": "epilogs",
-    "epitomise": "epitomize",
-    "epitomised": "epitomized",
-    "epitomises": "epitomizes",
-    "epitomising": "epitomizing",
-    "equalisation": "equalization",
-    "equalise": "equalize",
-    "equalised": "equalized",
-    "equaliser": "equalizer",
-    "equalisers": "equalizers",
-    "equalises": "equalizes",
-    "equalising": "equalizing",
-    "eulogise": "eulogize",
-    "eulogised": "eulogized",
-    "eulogises": "eulogizes",
-    "eulogising": "eulogizing",
-    "evangelise": "evangelize",
-    "evangelised": "evangelized",
-    "evangelises": "evangelizes",
-    "evangelising": "evangelizing",
-    "exorcise": "exorcize",
-    "exorcised": "exorcized",
-    "exorcises": "exorcizes",
-    "exorcising": "exorcizing",
-    "extemporisation": "extemporization",
-    "extemporise": "extemporize",
-    "extemporised": "extemporized",
-    "extemporises": "extemporizes",
-    "extemporising": "extemporizing",
-    "externalisation": "externalization",
-    "externalisations": "externalizations",
-    "externalise": "externalize",
-    "externalised": "externalized",
-    "externalises": "externalizes",
-    "externalising": "externalizing",
-    "factorise": "factorize",
-    "factorised": "factorized",
-    "factorises": "factorizes",
-    "factorising": "factorizing",
-    "faecal": "fecal",
-    "faeces": "feces",
-    "familiarisation": "familiarization",
-    "familiarise": "familiarize",
-    "familiarised": "familiarized",
-    "familiarises": "familiarizes",
-    "familiarising": "familiarizing",
-    "fantasise": "fantasize",
-    "fantasised": "fantasized",
-    "fantasises": "fantasizes",
-    "fantasising": "fantasizing",
-    "favour": "favor",
-    "favourable": "favorable",
-    "favourably": "favorably",
-    "favoured": "favored",
-    "favouring": "favoring",
-    "favourite": "favorite",
-    "favourites": "favorites",
-    "favouritism": "favoritism",
-    "favours": "favors",
-    "feminise": "feminize",
-    "feminised": "feminized",
-    "feminises": "feminizes",
-    "feminising": "feminizing",
-    "fertilisation": "fertilization",
-    "fertilise": "fertilize",
-    "fertilised": "fertilized",
-    "fertiliser": "fertilizer",
-    "fertilisers": "fertilizers",
-    "fertilises": "fertilizes",
-    "fertilising": "fertilizing",
-    "fervour": "fervor",
-    "fibre": "fiber",
-    "fibreglass": "fiberglass",
-    "fibres": "fibers",
-    "fictionalisation": "fictionalization",
-    "fictionalisations": "fictionalizations",
-    "fictionalise": "fictionalize",
-    "fictionalised": "fictionalized",
-    "fictionalises": "fictionalizes",
-    "fictionalising": "fictionalizing",
-    "fillet": "filet",
-    "filleted": "fileted",
-    "filleting": "fileting",
-    "fillets": "filets",
-    "finalisation": "finalization",
-    "finalise": "finalize",
-    "finalised": "finalized",
-    "finalises": "finalizes",
-    "finalising": "finalizing",
-    "flautist": "flutist",
-    "flautists": "flutists",
-    "flavour": "flavor",
-    "flavoured": "flavored",
-    "flavouring": "flavoring",
-    "flavourings": "flavorings",
-    "flavourless": "flavorless",
-    "flavours": "flavors",
-    "flavoursome": "flavorsome",
-    "flyer / flier": "flier / flyer",
-    "foetal": "fetal",
-    "foetid": "fetid",
-    "foetus": "fetus",
-    "foetuses": "fetuses",
-    "formalisation": "formalization",
-    "formalise": "formalize",
-    "formalised": "formalized",
-    "formalises": "formalizes",
-    "formalising": "formalizing",
-    "fossilisation": "fossilization",
-    "fossilise": "fossilize",
-    "fossilised": "fossilized",
-    "fossilises": "fossilizes",
-    "fossilising": "fossilizing",
-    "fraternisation": "fraternization",
-    "fraternise": "fraternize",
-    "fraternised": "fraternized",
-    "fraternises": "fraternizes",
-    "fraternising": "fraternizing",
-    "fulfil": "fulfill",
-    "fulfilment": "fulfillment",
-    "fulfils": "fulfills",
-    "funnelled": "funneled",
-    "funnelling": "funneling",
-    "galvanise": "galvanize",
-    "galvanised": "galvanized",
-    "galvanises": "galvanizes",
-    "galvanising": "galvanizing",
-    "gambolled": "gamboled",
-    "gambolling": "gamboling",
-    "gaol": "jail",
-    "gaolbird": "jailbird",
-    "gaolbirds": "jailbirds",
-    "gaolbreak": "jailbreak",
-    "gaolbreaks": "jailbreaks",
-    "gaoled": "jailed",
-    "gaoler": "jailer",
-    "gaolers": "jailers",
-    "gaoling": "jailing",
-    "gaols": "jails",
-    "gasses": "gases",
-    "gage": "gauge",
-    "gaged": "gauged",
-    "gages": "gauges",
-    "gaging": "gauging",
-    "generalisation": "generalization",
-    "generalisations": "generalizations",
-    "generalise": "generalize",
-    "generalised": "generalized",
-    "generalises": "generalizes",
-    "generalising": "generalizing",
-    "ghettoise": "ghettoize",
-    "ghettoised": "ghettoized",
-    "ghettoises": "ghettoizes",
-    "ghettoising": "ghettoizing",
-    "gipsies": "gypsies",
-    "glamorise": "glamorize",
-    "glamorised": "glamorized",
-    "glamorises": "glamorizes",
-    "glamorising": "glamorizing",
-    "glamor": "glamour",
-    "globalisation": "globalization",
-    "globalise": "globalize",
-    "globalised": "globalized",
-    "globalises": "globalizes",
-    "globalising": "globalizing",
-    "glueing": "gluing",
-    "goitre": "goiter",
-    "goitres": "goiters",
-    "gonorrhoea": "gonorrhea",
-    "gramme": "gram",
-    "grammes": "grams",
-    "gravelled": "graveled",
-    "grey": "gray",
-    "greyed": "grayed",
-    "greying": "graying",
-    "greyish": "grayish",
-    "greyness": "grayness",
-    "greys": "grays",
-    "grovelled": "groveled",
-    "grovelling": "groveling",
-    "groyne": "groin",
-    "groynes": "groins",
-    "gruelling": "grueling",
-    "gruellingly": "gruelingly",
-    "gryphon": "griffin",
-    "gryphons": "griffins",
-    "gynaecological": "gynecological",
-    "gynaecologist": "gynecologist",
-    "gynaecologists": "gynecologists",
-    "gynaecology": "gynecology",
-    "haematological": "hematological",
-    "haematologist": "hematologist",
-    "haematologists": "hematologists",
-    "haematology": "hematology",
-    "haemoglobin": "hemoglobin",
-    "haemophilia": "hemophilia",
-    "haemophiliac": "hemophiliac",
-    "haemophiliacs": "hemophiliacs",
-    "haemorrhage": "hemorrhage",
-    "haemorrhaged": "hemorrhaged",
-    "haemorrhages": "hemorrhages",
-    "haemorrhaging": "hemorrhaging",
-    "haemorrhoids": "hemorrhoids",
-    "harbour": "harbor",
-    "harboured": "harbored",
-    "harbouring": "harboring",
-    "harbours": "harbors",
-    "harmonisation": "harmonization",
-    "harmonise": "harmonize",
-    "harmonised": "harmonized",
-    "harmonises": "harmonizes",
-    "harmonising": "harmonizing",
-    "homoeopath": "homeopath",
-    "homoeopathic": "homeopathic",
-    "homoeopaths": "homeopaths",
-    "homoeopathy": "homeopathy",
-    "homogenise": "homogenize",
-    "homogenised": "homogenized",
-    "homogenises": "homogenizes",
-    "homogenising": "homogenizing",
-    "honour": "honor",
-    "honourable": "honorable",
-    "honourably": "honorably",
-    "honoured": "honored",
-    "honouring": "honoring",
-    "honours": "honors",
-    "hospitalisation": "hospitalization",
-    "hospitalise": "hospitalize",
-    "hospitalised": "hospitalized",
-    "hospitalises": "hospitalizes",
-    "hospitalising": "hospitalizing",
-    "humanise": "humanize",
-    "humanised": "humanized",
-    "humanises": "humanizes",
-    "humanising": "humanizing",
-    "humour": "humor",
-    "humoured": "humored",
-    "humouring": "humoring",
-    "humourless": "humorless",
-    "humours": "humors",
-    "hybridise": "hybridize",
-    "hybridised": "hybridized",
-    "hybridises": "hybridizes",
-    "hybridising": "hybridizing",
-    "hypnotise": "hypnotize",
-    "hypnotised": "hypnotized",
-    "hypnotises": "hypnotizes",
-    "hypnotising": "hypnotizing",
-    "hypothesise": "hypothesize",
-    "hypothesised": "hypothesized",
-    "hypothesises": "hypothesizes",
-    "hypothesising": "hypothesizing",
-    "idealisation": "idealization",
-    "idealise": "idealize",
-    "idealised": "idealized",
-    "idealises": "idealizes",
-    "idealising": "idealizing",
-    "idolise": "idolize",
-    "idolised": "idolized",
-    "idolises": "idolizes",
-    "idolising": "idolizing",
-    "immobilisation": "immobilization",
-    "immobilise": "immobilize",
-    "immobilised": "immobilized",
-    "immobiliser": "immobilizer",
-    "immobilisers": "immobilizers",
-    "immobilises": "immobilizes",
-    "immobilising": "immobilizing",
-    "immortalise": "immortalize",
-    "immortalised": "immortalized",
-    "immortalises": "immortalizes",
-    "immortalising": "immortalizing",
-    "immunisation": "immunization",
-    "immunise": "immunize",
-    "immunised": "immunized",
-    "immunises": "immunizes",
-    "immunising": "immunizing",
-    "impanelled": "impaneled",
-    "impanelling": "impaneling",
-    "imperilled": "imperiled",
-    "imperilling": "imperiling",
-    "individualise": "individualize",
-    "individualised": "individualized",
-    "individualises": "individualizes",
-    "individualising": "individualizing",
-    "industrialise": "industrialize",
-    "industrialised": "industrialized",
-    "industrialises": "industrializes",
-    "industrialising": "industrializing",
-    "inflexion": "inflection",
-    "inflexions": "inflections",
-    "initialise": "initialize",
-    "initialised": "initialized",
-    "initialises": "initializes",
-    "initialising": "initializing",
-    "initialled": "initialed",
-    "initialling": "initialing",
-    "instal": "install",
-    "instalment": "installment",
-    "instalments": "installments",
-    "instals": "installs",
-    "instil": "instill",
-    "instils": "instills",
-    "institutionalisation": "institutionalization",
-    "institutionalise": "institutionalize",
-    "institutionalised": "institutionalized",
-    "institutionalises": "institutionalizes",
-    "institutionalising": "institutionalizing",
-    "intellectualise": "intellectualize",
-    "intellectualised": "intellectualized",
-    "intellectualises": "intellectualizes",
-    "intellectualising": "intellectualizing",
-    "internalisation": "internalization",
-    "internalise": "internalize",
-    "internalised": "internalized",
-    "internalises": "internalizes",
-    "internalising": "internalizing",
-    "internationalisation": "internationalization",
-    "internationalise": "internationalize",
-    "internationalised": "internationalized",
-    "internationalises": "internationalizes",
-    "internationalising": "internationalizing",
-    "ionisation": "ionization",
-    "ionise": "ionize",
-    "ionised": "ionized",
-    "ioniser": "ionizer",
-    "ionisers": "ionizers",
-    "ionises": "ionizes",
-    "ionising": "ionizing",
-    "italicise": "italicize",
-    "italicised": "italicized",
-    "italicises": "italicizes",
-    "italicising": "italicizing",
-    "itemise": "itemize",
-    "itemised": "itemized",
-    "itemises": "itemizes",
-    "itemising": "itemizing",
-    "jeopardise": "jeopardize",
-    "jeopardised": "jeopardized",
-    "jeopardises": "jeopardizes",
-    "jeopardising": "jeopardizing",
-    "jewelled": "jeweled",
-    "jeweller": "jeweler",
-    "jewellers": "jewelers",
-    "jewellery": "jewelry",
-    "judgement": "judgment",
-    "kilogramme": "kilogram",
-    "kilogrammes": "kilograms",
-    "kilometre": "kilometer",
-    "kilometres": "kilometers",
-    "labelled": "labeled",
-    "labelling": "labeling",
-    "labour": "labor",
-    "laboured": "labored",
-    "labourer": "laborer",
-    "labourers": "laborers",
-    "labouring": "laboring",
-    "labours": "labors",
-    "lacklustre": "lackluster",
-    "legalisation": "legalization",
-    "legalise": "legalize",
-    "legalised": "legalized",
-    "legalises": "legalizes",
-    "legalising": "legalizing",
-    "legitimise": "legitimize",
-    "legitimised": "legitimized",
-    "legitimises": "legitimizes",
-    "legitimising": "legitimizing",
-    "leukaemia": "leukemia",
-    "levelled": "leveled",
-    "leveller": "leveler",
-    "levellers": "levelers",
-    "levelling": "leveling",
-    "libelled": "libeled",
-    "libelling": "libeling",
-    "libellous": "libelous",
-    "liberalisation": "liberalization",
-    "liberalise": "liberalize",
-    "liberalised": "liberalized",
-    "liberalises": "liberalizes",
-    "liberalising": "liberalizing",
-    "licence": "license",
-    "licenced": "licensed",
-    "licences": "licenses",
-    "licencing": "licensing",
-    "likeable": "likable",
-    "lionisation": "lionization",
-    "lionise": "lionize",
-    "lionised": "lionized",
-    "lionises": "lionizes",
-    "lionising": "lionizing",
-    "liquidise": "liquidize",
-    "liquidised": "liquidized",
-    "liquidiser": "liquidizer",
-    "liquidisers": "liquidizers",
-    "liquidises": "liquidizes",
-    "liquidising": "liquidizing",
-    "litre": "liter",
-    "litres": "liters",
-    "localise": "localize",
-    "localised": "localized",
-    "localises": "localizes",
-    "localising": "localizing",
-    "louvre": "louver",
-    "louvred": "louvered",
-    "louvres": "louvers",
-    "lustre": "luster",
-    "magnetise": "magnetize",
-    "magnetised": "magnetized",
-    "magnetises": "magnetizes",
-    "magnetising": "magnetizing",
-    "manoeuvrability": "maneuverability",
-    "manoeuvrable": "maneuverable",
-    "manoeuvre": "maneuver",
-    "manoeuvred": "maneuvered",
-    "manoeuvres": "maneuvers",
-    "manoeuvring": "maneuvering",
-    "manoeuvrings": "maneuverings",
-    "marginalisation": "marginalization",
-    "marginalise": "marginalize",
-    "marginalised": "marginalized",
-    "marginalises": "marginalizes",
-    "marginalising": "marginalizing",
-    "marshalled": "marshaled",
-    "marshalling": "marshaling",
-    "marvelled": "marveled",
-    "marvelling": "marveling",
-    "marvellous": "marvelous",
-    "marvellously": "marvelously",
-    "materialisation": "materialization",
-    "materialise": "materialize",
-    "materialised": "materialized",
-    "materialises": "materializes",
-    "materialising": "materializing",
-    "maximisation": "maximization",
-    "maximise": "maximize",
-    "maximised": "maximized",
-    "maximises": "maximizes",
-    "maximising": "maximizing",
-    "meagre": "meager",
-    "mechanisation": "mechanization",
-    "mechanise": "mechanize",
-    "mechanised": "mechanized",
-    "mechanises": "mechanizes",
-    "mechanising": "mechanizing",
-    "mediaeval": "medieval",
-    "memorialise": "memorialize",
-    "memorialised": "memorialized",
-    "memorialises": "memorializes",
-    "memorialising": "memorializing",
-    "memorise": "memorize",
-    "memorised": "memorized",
-    "memorises": "memorizes",
-    "memorising": "memorizing",
-    "mesmerise": "mesmerize",
-    "mesmerised": "mesmerized",
-    "mesmerises": "mesmerizes",
-    "mesmerising": "mesmerizing",
-    "metabolise": "metabolize",
-    "metabolised": "metabolized",
-    "metabolises": "metabolizes",
-    "metabolising": "metabolizing",
-    "metre": "meter",
-    "metres": "meters",
-    "micrometre": "micrometer",
-    "micrometres": "micrometers",
-    "militarise": "militarize",
-    "militarised": "militarized",
-    "militarises": "militarizes",
-    "militarising": "militarizing",
-    "milligramme": "milligram",
-    "milligrammes": "milligrams",
-    "millilitre": "milliliter",
-    "millilitres": "milliliters",
-    "millimetre": "millimeter",
-    "millimetres": "millimeters",
-    "miniaturisation": "miniaturization",
-    "miniaturise": "miniaturize",
-    "miniaturised": "miniaturized",
-    "miniaturises": "miniaturizes",
-    "miniaturising": "miniaturizing",
-    "minibusses": "minibuses",
-    "minimise": "minimize",
-    "minimised": "minimized",
-    "minimises": "minimizes",
-    "minimising": "minimizing",
-    "misbehaviour": "misbehavior",
-    "misdemeanour": "misdemeanor",
-    "misdemeanours": "misdemeanors",
-    "misspelt": "misspelled",
-    "mitre": "miter",
-    "mitres": "miters",
-    "mobilisation": "mobilization",
-    "mobilise": "mobilize",
-    "mobilised": "mobilized",
-    "mobilises": "mobilizes",
-    "mobilising": "mobilizing",
-    "modelled": "modeled",
-    "modeller": "modeler",
-    "modellers": "modelers",
-    "modelling": "modeling",
-    "modernise": "modernize",
-    "modernised": "modernized",
-    "modernises": "modernizes",
-    "modernising": "modernizing",
-    "moisturise": "moisturize",
-    "moisturised": "moisturized",
-    "moisturiser": "moisturizer",
-    "moisturisers": "moisturizers",
-    "moisturises": "moisturizes",
-    "moisturising": "moisturizing",
-    "monologue": "monolog",
-    "monologues": "monologs",
-    "monopolisation": "monopolization",
-    "monopolise": "monopolize",
-    "monopolised": "monopolized",
-    "monopolises": "monopolizes",
-    "monopolising": "monopolizing",
-    "moralise": "moralize",
-    "moralised": "moralized",
-    "moralises": "moralizes",
-    "moralising": "moralizing",
-    "motorised": "motorized",
-    "mould": "mold",
-    "moulded": "molded",
-    "moulder": "molder",
-    "mouldered": "moldered",
-    "mouldering": "moldering",
-    "moulders": "molders",
-    "mouldier": "moldier",
-    "mouldiest": "moldiest",
-    "moulding": "molding",
-    "mouldings": "moldings",
-    "moulds": "molds",
-    "mouldy": "moldy",
-    "moult": "molt",
-    "moulted": "molted",
-    "moulting": "molting",
-    "moults": "molts",
-    "moustache": "mustache",
-    "moustached": "mustached",
-    "moustaches": "mustaches",
-    "moustachioed": "mustachioed",
-    "multicoloured": "multicolored",
-    "nationalisation": "nationalization",
-    "nationalisations": "nationalizations",
-    "nationalise": "nationalize",
-    "nationalised": "nationalized",
-    "nationalises": "nationalizes",
-    "nationalising": "nationalizing",
-    "naturalisation": "naturalization",
-    "naturalise": "naturalize",
-    "naturalised": "naturalized",
-    "naturalises": "naturalizes",
-    "naturalising": "naturalizing",
-    "neighbour": "neighbor",
-    "neighbourhood": "neighborhood",
-    "neighbourhoods": "neighborhoods",
-    "neighbouring": "neighboring",
-    "neighbourliness": "neighborliness",
-    "neighbourly": "neighborly",
-    "neighbours": "neighbors",
-    "neutralisation": "neutralization",
-    "neutralise": "neutralize",
-    "neutralised": "neutralized",
-    "neutralises": "neutralizes",
-    "neutralising": "neutralizing",
-    "normalisation": "normalization",
-    "normalise": "normalize",
-    "normalised": "normalized",
-    "normalises": "normalizes",
-    "normalising": "normalizing",
-    "odour": "odor",
-    "odourless": "odorless",
-    "odours": "odors",
-    "oesophagus": "esophagus",
-    "oesophaguses": "esophaguses",
-    "oestrogen": "estrogen",
-    "offence": "offense",
-    "offences": "offenses",
-    "omelette": "omelet",
-    "omelettes": "omelets",
-    "optimise": "optimize",
-    "optimised": "optimized",
-    "optimises": "optimizes",
-    "optimising": "optimizing",
-    "organisation": "organization",
-    "organisational": "organizational",
-    "organisations": "organizations",
-    "organise": "organize",
-    "organised": "organized",
-    "organiser": "organizer",
-    "organisers": "organizers",
-    "organises": "organizes",
-    "organising": "organizing",
-    "orthopaedic": "orthopedic",
-    "orthopaedics": "orthopedics",
-    "ostracise": "ostracize",
-    "ostracised": "ostracized",
-    "ostracises": "ostracizes",
-    "ostracising": "ostracizing",
-    "outmanoeuvre": "outmaneuver",
-    "outmanoeuvred": "outmaneuvered",
-    "outmanoeuvres": "outmaneuvers",
-    "outmanoeuvring": "outmaneuvering",
-    "overemphasise": "overemphasize",
-    "overemphasised": "overemphasized",
-    "overemphasises": "overemphasizes",
-    "overemphasising": "overemphasizing",
-    "oxidisation": "oxidization",
-    "oxidise": "oxidize",
-    "oxidised": "oxidized",
-    "oxidises": "oxidizes",
-    "oxidising": "oxidizing",
-    "paederast": "pederast",
-    "paederasts": "pederasts",
-    "paediatric": "pediatric",
-    "paediatrician": "pediatrician",
-    "paediatricians": "pediatricians",
-    "paediatrics": "pediatrics",
-    "paedophile": "pedophile",
-    "paedophiles": "pedophiles",
-    "paedophilia": "pedophilia",
-    "palaeolithic": "paleolithic",
-    "palaeontologist": "paleontologist",
-    "palaeontologists": "paleontologists",
-    "palaeontology": "paleontology",
-    "panelled": "paneled",
-    "panelling": "paneling",
-    "panellist": "panelist",
-    "panellists": "panelists",
-    "paralyse": "paralyze",
-    "paralysed": "paralyzed",
-    "paralyses": "paralyzes",
-    "paralysing": "paralyzing",
-    "parcelled": "parceled",
-    "parcelling": "parceling",
-    "parlour": "parlor",
-    "parlours": "parlors",
-    "particularise": "particularize",
-    "particularised": "particularized",
-    "particularises": "particularizes",
-    "particularising": "particularizing",
-    "passivisation": "passivization",
-    "passivise": "passivize",
-    "passivised": "passivized",
-    "passivises": "passivizes",
-    "passivising": "passivizing",
-    "pasteurisation": "pasteurization",
-    "pasteurise": "pasteurize",
-    "pasteurised": "pasteurized",
-    "pasteurises": "pasteurizes",
-    "pasteurising": "pasteurizing",
-    "patronise": "patronize",
-    "patronised": "patronized",
-    "patronises": "patronizes",
-    "patronising": "patronizing",
-    "patronisingly": "patronizingly",
-    "pedalled": "pedaled",
-    "pedalling": "pedaling",
-    "pedestrianisation": "pedestrianization",
-    "pedestrianise": "pedestrianize",
-    "pedestrianised": "pedestrianized",
-    "pedestrianises": "pedestrianizes",
-    "pedestrianising": "pedestrianizing",
-    "penalise": "penalize",
-    "penalised": "penalized",
-    "penalises": "penalizes",
-    "penalising": "penalizing",
-    "pencilled": "penciled",
-    "pencilling": "penciling",
-    "personalise": "personalize",
-    "personalised": "personalized",
-    "personalises": "personalizes",
-    "personalising": "personalizing",
-    "pharmacopoeia": "pharmacopeia",
-    "pharmacopoeias": "pharmacopeias",
-    "philosophise": "philosophize",
-    "philosophised": "philosophized",
-    "philosophises": "philosophizes",
-    "philosophising": "philosophizing",
-    "philtre": "filter",
-    "philtres": "filters",
-    "phoney": "phony",
-    "plagiarise": "plagiarize",
-    "plagiarised": "plagiarized",
-    "plagiarises": "plagiarizes",
-    "plagiarising": "plagiarizing",
-    "plough": "plow",
-    "ploughed": "plowed",
-    "ploughing": "plowing",
-    "ploughman": "plowman",
-    "ploughmen": "plowmen",
-    "ploughs": "plows",
-    "ploughshare": "plowshare",
-    "ploughshares": "plowshares",
-    "polarisation": "polarization",
-    "polarise": "polarize",
-    "polarised": "polarized",
-    "polarises": "polarizes",
-    "polarising": "polarizing",
-    "politicisation": "politicization",
-    "politicise": "politicize",
-    "politicised": "politicized",
-    "politicises": "politicizes",
-    "politicising": "politicizing",
-    "popularisation": "popularization",
-    "popularise": "popularize",
-    "popularised": "popularized",
-    "popularises": "popularizes",
-    "popularising": "popularizing",
-    "pouffe": "pouf",
-    "pouffes": "poufs",
-    "practise": "practice",
-    "practised": "practiced",
-    "practises": "practices",
-    "practising": "practicing",
-    "praesidium": "presidium",
-    "praesidiums": "presidiums",
-    "pressurisation": "pressurization",
-    "pressurise": "pressurize",
-    "pressurised": "pressurized",
-    "pressurises": "pressurizes",
-    "pressurising": "pressurizing",
-    "pretence": "pretense",
-    "pretences": "pretenses",
-    "primaeval": "primeval",
-    "prioritisation": "prioritization",
-    "prioritise": "prioritize",
-    "prioritised": "prioritized",
-    "prioritises": "prioritizes",
-    "prioritising": "prioritizing",
-    "privatisation": "privatization",
-    "privatisations": "privatizations",
-    "privatise": "privatize",
-    "privatised": "privatized",
-    "privatises": "privatizes",
-    "privatising": "privatizing",
-    "professionalisation": "professionalization",
-    "professionalise": "professionalize",
-    "professionalised": "professionalized",
-    "professionalises": "professionalizes",
-    "professionalising": "professionalizing",
-    "programme": "program",
-    "programmes": "programs",
-    "prologue": "prolog",
-    "prologues": "prologs",
-    "propagandise": "propagandize",
-    "propagandised": "propagandized",
-    "propagandises": "propagandizes",
-    "propagandising": "propagandizing",
-    "proselytise": "proselytize",
-    "proselytised": "proselytized",
-    "proselytiser": "proselytizer",
-    "proselytisers": "proselytizers",
-    "proselytises": "proselytizes",
-    "proselytising": "proselytizing",
-    "psychoanalyse": "psychoanalyze",
-    "psychoanalysed": "psychoanalyzed",
-    "psychoanalyses": "psychoanalyzes",
-    "psychoanalysing": "psychoanalyzing",
-    "publicise": "publicize",
-    "publicised": "publicized",
-    "publicises": "publicizes",
-    "publicising": "publicizing",
-    "pulverisation": "pulverization",
-    "pulverise": "pulverize",
-    "pulverised": "pulverized",
-    "pulverises": "pulverizes",
-    "pulverising": "pulverizing",
-    "pummelled": "pummel",
-    "pummelling": "pummeled",
-    "pyjama": "pajama",
-    "pyjamas": "pajamas",
-    "pzazz": "pizzazz",
-    "quarrelled": "quarreled",
-    "quarrelling": "quarreling",
-    "radicalise": "radicalize",
-    "radicalised": "radicalized",
-    "radicalises": "radicalizes",
-    "radicalising": "radicalizing",
-    "rancour": "rancor",
-    "randomise": "randomize",
-    "randomised": "randomized",
-    "randomises": "randomizes",
-    "randomising": "randomizing",
-    "rationalisation": "rationalization",
-    "rationalisations": "rationalizations",
-    "rationalise": "rationalize",
-    "rationalised": "rationalized",
-    "rationalises": "rationalizes",
-    "rationalising": "rationalizing",
-    "ravelled": "raveled",
-    "ravelling": "raveling",
-    "realisable": "realizable",
-    "realisation": "realization",
-    "realisations": "realizations",
-    "realise": "realize",
-    "realised": "realized",
-    "realises": "realizes",
-    "realising": "realizing",
-    "recognisable": "recognizable",
-    "recognisably": "recognizably",
-    "recognisance": "recognizance",
-    "recognise": "recognize",
-    "recognised": "recognized",
-    "recognises": "recognizes",
-    "recognising": "recognizing",
-    "reconnoitre": "reconnoiter",
-    "reconnoitred": "reconnoitered",
-    "reconnoitres": "reconnoiters",
-    "reconnoitring": "reconnoitering",
-    "refuelled": "refueled",
-    "refuelling": "refueling",
-    "regularisation": "regularization",
-    "regularise": "regularize",
-    "regularised": "regularized",
-    "regularises": "regularizes",
-    "regularising": "regularizing",
-    "remodelled": "remodeled",
-    "remodelling": "remodeling",
-    "remould": "remold",
-    "remoulded": "remolded",
-    "remoulding": "remolding",
-    "remoulds": "remolds",
-    "reorganisation": "reorganization",
-    "reorganisations": "reorganizations",
-    "reorganise": "reorganize",
-    "reorganised": "reorganized",
-    "reorganises": "reorganizes",
-    "reorganising": "reorganizing",
-    "revelled": "reveled",
-    "reveller": "reveler",
-    "revellers": "revelers",
-    "revelling": "reveling",
-    "revitalise": "revitalize",
-    "revitalised": "revitalized",
-    "revitalises": "revitalizes",
-    "revitalising": "revitalizing",
-    "revolutionise": "revolutionize",
-    "revolutionised": "revolutionized",
-    "revolutionises": "revolutionizes",
-    "revolutionising": "revolutionizing",
-    "rhapsodise": "rhapsodize",
-    "rhapsodised": "rhapsodized",
-    "rhapsodises": "rhapsodizes",
-    "rhapsodising": "rhapsodizing",
-    "rigour": "rigor",
-    "rigours": "rigors",
-    "ritualised": "ritualized",
-    "rivalled": "rivaled",
-    "rivalling": "rivaling",
-    "romanticise": "romanticize",
-    "romanticised": "romanticized",
-    "romanticises": "romanticizes",
-    "romanticising": "romanticizing",
-    "rumour": "rumor",
-    "rumoured": "rumored",
-    "rumours": "rumors",
-    "sabre": "saber",
-    "sabres": "sabers",
-    "saltpetre": "saltpeter",
-    "sanitise": "sanitize",
-    "sanitised": "sanitized",
-    "sanitises": "sanitizes",
-    "sanitising": "sanitizing",
-    "satirise": "satirize",
-    "satirised": "satirized",
-    "satirises": "satirizes",
-    "satirising": "satirizing",
-    "saviour": "savior",
-    "saviours": "saviors",
-    "savour": "savor",
-    "savoured": "savored",
-    "savouries": "savories",
-    "savouring": "savoring",
-    "savours": "savors",
-    "savoury": "savory",
-    "scandalise": "scandalize",
-    "scandalised": "scandalized",
-    "scandalises": "scandalizes",
-    "scandalising": "scandalizing",
-    "sceptic": "skeptic",
-    "sceptical": "skeptical",
-    "sceptically": "skeptically",
-    "scepticism": "skepticism",
-    "sceptics": "skeptics",
-    "sceptre": "scepter",
-    "sceptres": "scepters",
-    "scrutinise": "scrutinize",
-    "scrutinised": "scrutinized",
-    "scrutinises": "scrutinizes",
-    "scrutinising": "scrutinizing",
-    "secularisation": "secularization",
-    "secularise": "secularize",
-    "secularised": "secularized",
-    "secularises": "secularizes",
-    "secularising": "secularizing",
-    "sensationalise": "sensationalize",
-    "sensationalised": "sensationalized",
-    "sensationalises": "sensationalizes",
-    "sensationalising": "sensationalizing",
-    "sensitise": "sensitize",
-    "sensitised": "sensitized",
-    "sensitises": "sensitizes",
-    "sensitising": "sensitizing",
-    "sentimentalise": "sentimentalize",
-    "sentimentalised": "sentimentalized",
-    "sentimentalises": "sentimentalizes",
-    "sentimentalising": "sentimentalizing",
-    "sepulchre": "sepulcher",
-    "sepulchres": "sepulchers",
-    "serialisation": "serialization",
-    "serialisations": "serializations",
-    "serialise": "serialize",
-    "serialised": "serialized",
-    "serialises": "serializes",
-    "serialising": "serializing",
-    "sermonise": "sermonize",
-    "sermonised": "sermonized",
-    "sermonises": "sermonizes",
-    "sermonising": "sermonizing",
-    "sheikh": "sheik",
-    "shovelled": "shoveled",
-    "shovelling": "shoveling",
-    "shrivelled": "shriveled",
-    "shrivelling": "shriveling",
-    "signalise": "signalize",
-    "signalised": "signalized",
-    "signalises": "signalizes",
-    "signalising": "signalizing",
-    "signalled": "signaled",
-    "signalling": "signaling",
-    "smoulder": "smolder",
-    "smouldered": "smoldered",
-    "smouldering": "smoldering",
-    "smoulders": "smolders",
-    "snivelled": "sniveled",
-    "snivelling": "sniveling",
-    "snorkelled": "snorkeled",
-    "snorkelling": "snorkeling",
-    "snowplough": "snowplow",
-    "snowploughs": "snowplow",
-    "socialisation": "socialization",
-    "socialise": "socialize",
-    "socialised": "socialized",
-    "socialises": "socializes",
-    "socialising": "socializing",
-    "sodomise": "sodomize",
-    "sodomised": "sodomized",
-    "sodomises": "sodomizes",
-    "sodomising": "sodomizing",
-    "solemnise": "solemnize",
-    "solemnised": "solemnized",
-    "solemnises": "solemnizes",
-    "solemnising": "solemnizing",
-    "sombre": "somber",
-    "specialisation": "specialization",
-    "specialisations": "specializations",
-    "specialise": "specialize",
-    "specialised": "specialized",
-    "specialises": "specializes",
-    "specialising": "specializing",
-    "spectre": "specter",
-    "spectres": "specters",
-    "spiralled": "spiraled",
-    "spiralling": "spiraling",
-    "splendour": "splendor",
-    "splendours": "splendors",
-    "squirrelled": "squirreled",
-    "squirrelling": "squirreling",
-    "stabilisation": "stabilization",
-    "stabilise": "stabilize",
-    "stabilised": "stabilized",
-    "stabiliser": "stabilizer",
-    "stabilisers": "stabilizers",
-    "stabilises": "stabilizes",
-    "stabilising": "stabilizing",
-    "standardisation": "standardization",
-    "standardise": "standardize",
-    "standardised": "standardized",
-    "standardises": "standardizes",
-    "standardising": "standardizing",
-    "stencilled": "stenciled",
-    "stencilling": "stenciling",
-    "sterilisation": "sterilization",
-    "sterilisations": "sterilizations",
-    "sterilise": "sterilize",
-    "sterilised": "sterilized",
-    "steriliser": "sterilizer",
-    "sterilisers": "sterilizers",
-    "sterilises": "sterilizes",
-    "sterilising": "sterilizing",
-    "stigmatisation": "stigmatization",
-    "stigmatise": "stigmatize",
-    "stigmatised": "stigmatized",
-    "stigmatises": "stigmatizes",
-    "stigmatising": "stigmatizing",
-    "storey": "story",
-    "storeys": "stories",
-    "subsidisation": "subsidization",
-    "subsidise": "subsidize",
-    "subsidised": "subsidized",
-    "subsidiser": "subsidizer",
-    "subsidisers": "subsidizers",
-    "subsidises": "subsidizes",
-    "subsidising": "subsidizing",
-    "succour": "succor",
-    "succoured": "succored",
-    "succouring": "succoring",
-    "succours": "succors",
-    "sulphate": "sulfate",
-    "sulphates": "sulfates",
-    "sulphide": "sulfide",
-    "sulphides": "sulfides",
-    "sulphur": "sulfur",
-    "sulphurous": "sulfurous",
-    "summarise": "summarize",
-    "summarised": "summarized",
-    "summarises": "summarizes",
-    "summarising": "summarizing",
-    "swivelled": "swiveled",
-    "swivelling": "swiveling",
-    "symbolise": "symbolize",
-    "symbolised": "symbolized",
-    "symbolises": "symbolizes",
-    "symbolising": "symbolizing",
-    "sympathise": "sympathize",
-    "sympathised": "sympathized",
-    "sympathiser": "sympathizer",
-    "sympathisers": "sympathizers",
-    "sympathises": "sympathizes",
-    "sympathising": "sympathizing",
-    "synchronisation": "synchronization",
-    "synchronise": "synchronize",
-    "synchronised": "synchronized",
-    "synchronises": "synchronizes",
-    "synchronising": "synchronizing",
-    "synthesise": "synthesize",
-    "synthesised": "synthesized",
-    "synthesiser": "synthesizer",
-    "synthesisers": "synthesizers",
-    "synthesises": "synthesizes",
-    "synthesising": "synthesizing",
-    "syphon": "siphon",
-    "syphoned": "siphoned",
-    "syphoning": "siphoning",
-    "syphons": "siphons",
-    "systematisation": "systematization",
-    "systematise": "systematize",
-    "systematised": "systematized",
-    "systematises": "systematizes",
-    "systematising": "systematizing",
-    "tantalise": "tantalize",
-    "tantalised": "tantalized",
-    "tantalises": "tantalizes",
-    "tantalising": "tantalizing",
-    "tantalisingly": "tantalizingly",
-    "tasselled": "tasseled",
-    "technicolour": "technicolor",
-    "temporise": "temporize",
-    "temporised": "temporized",
-    "temporises": "temporizes",
-    "temporising": "temporizing",
-    "tenderise": "tenderize",
-    "tenderised": "tenderized",
-    "tenderises": "tenderizes",
-    "tenderising": "tenderizing",
-    "terrorise": "terrorize",
-    "terrorised": "terrorized",
-    "terrorises": "terrorizes",
-    "terrorising": "terrorizing",
-    "theatre": "theater",
-    "theatregoer": "theatergoer",
-    "theatregoers": "theatergoers",
-    "theatres": "theaters",
-    "theorise": "theorize",
-    "theorised": "theorized",
-    "theorises": "theorizes",
-    "theorising": "theorizing",
-    "tonne": "ton",
-    "tonnes": "tons",
-    "towelled": "toweled",
-    "towelling": "toweling",
-    "toxaemia": "toxemia",
-    "tranquillise": "tranquilize",
-    "tranquillised": "tranquilized",
-    "tranquilliser": "tranquilizer",
-    "tranquillisers": "tranquilizers",
-    "tranquillises": "tranquilizes",
-    "tranquillising": "tranquilizing",
-    "tranquillity": "tranquility",
-    "tranquillize": "tranquilize",
-    "tranquillized": "tranquilized",
-    "tranquillizer": "tranquilizer",
-    "tranquillizers": "tranquilizers",
-    "tranquillizes": "tranquilizes",
-    "tranquillizing": "tranquilizing",
-    "tranquilly": "tranquility",
-    "transistorised": "transistorized",
-    "traumatise": "traumatize",
-    "traumatised": "traumatized",
-    "traumatises": "traumatizes",
-    "traumatising": "traumatizing",
-    "travelled": "traveled",
-    "traveller": "traveler",
-    "travellers": "travelers",
-    "travelling": "traveling",
-    "travelog": "travelogue",
-    "travelogs": "travelogues",
-    "trialled": "trialed",
-    "trialling": "trialing",
-    "tricolour": "tricolor",
-    "tricolours": "tricolors",
-    "trivialise": "trivialize",
-    "trivialised": "trivialized",
-    "trivialises": "trivializes",
-    "trivialising": "trivializing",
-    "tumour": "tumor",
-    "tumours": "tumors",
-    "tunnelled": "tunneled",
-    "tunnelling": "tunneling",
-    "tyrannise": "tyrannize",
-    "tyrannised": "tyrannized",
-    "tyrannises": "tyrannizes",
-    "tyrannising": "tyrannizing",
-    "tyre": "tire",
-    "tyres": "tires",
-    "unauthorised": "unauthorized",
-    "uncivilised": "uncivilized",
-    "underutilised": "underutilized",
-    "unequalled": "unequaled",
-    "unfavourable": "unfavorable",
-    "unfavourably": "unfavorably",
-    "unionisation": "unionization",
-    "unionise": "unionize",
-    "unionised": "unionized",
-    "unionises": "unionizes",
-    "unionising": "unionizing",
-    "unorganised": "unorganized",
-    "unravelled": "unraveled",
-    "unravelling": "unraveling",
-    "unrecognisable": "unrecognizable",
-    "unrecognised": "unrecognized",
-    "unrivalled": "unrivaled",
-    "unsavoury": "unsavory",
-    "untrammelled": "untrammeled",
-    "urbanisation": "urbanization",
-    "urbanise": "urbanize",
-    "urbanised": "urbanized",
-    "urbanises": "urbanizes",
-    "urbanising": "urbanizing",
-    "utilisable": "utilizable",
-    "utilisation": "utilization",
-    "utilise": "utilize",
-    "utilised": "utilized",
-    "utilises": "utilizes",
-    "utilising": "utilizing",
-    "valour": "valor",
-    "vandalise": "vandalize",
-    "vandalised": "vandalized",
-    "vandalises": "vandalizes",
-    "vandalising": "vandalizing",
-    "vaporisation": "vaporization",
-    "vaporise": "vaporize",
-    "vaporised": "vaporized",
-    "vaporises": "vaporizes",
-    "vaporising": "vaporizing",
-    "vapour": "vapor",
-    "vapours": "vapors",
-    "verbalise": "verbalize",
-    "verbalised": "verbalized",
-    "verbalises": "verbalizes",
-    "verbalising": "verbalizing",
-    "victimisation": "victimization",
-    "victimise": "victimize",
-    "victimised": "victimized",
-    "victimises": "victimizes",
-    "victimising": "victimizing",
-    "videodisc": "videodisk",
-    "videodiscs": "videodisks",
-    "vigour": "vigor",
-    "visualisation": "visualization",
-    "visualisations": "visualizations",
-    "visualise": "visualize",
-    "visualised": "visualized",
-    "visualises": "visualizes",
-    "visualising": "visualizing",
-    "vocalisation": "vocalization",
-    "vocalisations": "vocalizations",
-    "vocalise": "vocalize",
-    "vocalised": "vocalized",
-    "vocalises": "vocalizes",
-    "vocalising": "vocalizing",
-    "vulcanised": "vulcanized",
-    "vulgarisation": "vulgarization",
-    "vulgarise": "vulgarize",
-    "vulgarised": "vulgarized",
-    "vulgarises": "vulgarizes",
-    "vulgarising": "vulgarizing",
-    "waggon": "wagon",
-    "waggons": "wagons",
-    "watercolour": "watercolor",
-    "watercolours": "watercolors",
-    "weaselled": "weaseled",
-    "weaselling": "weaseling",
-    "westernisation": "westernization",
-    "westernise": "westernize",
-    "westernised": "westernized",
-    "westernises": "westernizes",
-    "westernising": "westernizing",
-    "womanise": "womanize",
-    "womanised": "womanized",
-    "womaniser": "womanizer",
-    "womanisers": "womanizers",
-    "womanises": "womanizes",
-    "womanising": "womanizing",
-    "woollen": "woolen",
-    "woollens": "woolens",
-    "woollies": "woolies",
-    "woolly": "wooly",
-    "worshipped": "worshiped",
-    "worshipping": "worshiping",
-    "worshipper": "worshiper",
-    "yodelled": "yodeled",
-    "yodelling": "yodeling",
-    "yoghourt": "yogurt",
-    "yoghourts": "yogurts",
-    "yoghurt": "yogurt",
-    "yoghurts": "yogurts",
-    "mhm": "hmm",
-    "mm": "hmm",
-    "mmm": "hmm"
-}
\ No newline at end of file
diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index fcf73c402284c..08cc247817584 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -13,8 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import os
 import re
 from fractions import Fraction
 from typing import Iterator, List, Match, Optional, Union
@@ -253,7 +251,7 @@ def output(result: Union[str, int]):
             if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
                 # arabic numbers (potentially with signs and fractions)
                 f = to_fraction(current_without_prefix)
-                if f is not None:
+                if f is None:
                     raise ValueError("Converting the fraction failed")
 
                 if value is not None:
@@ -502,6 +500,1750 @@ def __call__(self, s: str):
         return s
 
 
+ENGLISH_MAPPING = {
+    "accessorise": "accessorize",
+    "accessorised": "accessorized",
+    "accessorises": "accessorizes",
+    "accessorising": "accessorizing",
+    "acclimatisation": "acclimatization",
+    "acclimatise": "acclimatize",
+    "acclimatised": "acclimatized",
+    "acclimatises": "acclimatizes",
+    "acclimatising": "acclimatizing",
+    "accoutrements": "accouterments",
+    "aeon": "eon",
+    "aeons": "eons",
+    "aerogramme": "aerogram",
+    "aerogrammes": "aerograms",
+    "aeroplane": "airplane",
+    "aeroplanes": "airplanes",
+    "aesthete": "esthete",
+    "aesthetes": "esthetes",
+    "aesthetic": "esthetic",
+    "aesthetically": "esthetically",
+    "aesthetics": "esthetics",
+    "aetiology": "etiology",
+    "ageing": "aging",
+    "aggrandisement": "aggrandizement",
+    "agonise": "agonize",
+    "agonised": "agonized",
+    "agonises": "agonizes",
+    "agonising": "agonizing",
+    "agonisingly": "agonizingly",
+    "almanack": "almanac",
+    "almanacks": "almanacs",
+    "aluminium": "aluminum",
+    "amortisable": "amortizable",
+    "amortisation": "amortization",
+    "amortisations": "amortizations",
+    "amortise": "amortize",
+    "amortised": "amortized",
+    "amortises": "amortizes",
+    "amortising": "amortizing",
+    "amphitheatre": "amphitheater",
+    "amphitheatres": "amphitheaters",
+    "anaemia": "anemia",
+    "anaemic": "anemic",
+    "anaesthesia": "anesthesia",
+    "anaesthetic": "anesthetic",
+    "anaesthetics": "anesthetics",
+    "anaesthetise": "anesthetize",
+    "anaesthetised": "anesthetized",
+    "anaesthetises": "anesthetizes",
+    "anaesthetising": "anesthetizing",
+    "anaesthetist": "anesthetist",
+    "anaesthetists": "anesthetists",
+    "anaesthetize": "anesthetize",
+    "anaesthetized": "anesthetized",
+    "anaesthetizes": "anesthetizes",
+    "anaesthetizing": "anesthetizing",
+    "analogue": "analog",
+    "analogues": "analogs",
+    "analyse": "analyze",
+    "analysed": "analyzed",
+    "analyses": "analyzes",
+    "analysing": "analyzing",
+    "anglicise": "anglicize",
+    "anglicised": "anglicized",
+    "anglicises": "anglicizes",
+    "anglicising": "anglicizing",
+    "annualised": "annualized",
+    "antagonise": "antagonize",
+    "antagonised": "antagonized",
+    "antagonises": "antagonizes",
+    "antagonising": "antagonizing",
+    "apologise": "apologize",
+    "apologised": "apologized",
+    "apologises": "apologizes",
+    "apologising": "apologizing",
+    "appal": "appall",
+    "appals": "appalls",
+    "appetiser": "appetizer",
+    "appetisers": "appetizers",
+    "appetising": "appetizing",
+    "appetisingly": "appetizingly",
+    "arbour": "arbor",
+    "arbours": "arbors",
+    "archeological": "archaeological",
+    "archaeologically": "archeologically",
+    "archaeologist": "archeologist",
+    "archaeologists": "archeologists",
+    "archaeology": "archeology</span>",
+    "ardour": "ardor",
+    "armour": "armor",
+    "armoured": "armored",
+    "armourer": "armorer",
+    "armourers": "armorers",
+    "armouries": "armories",
+    "armoury": "armory",
+    "artefact": "artifact",
+    "artefacts": "artifacts",
+    "authorise": "authorize",
+    "authorised": "authorized",
+    "authorises": "authorizes",
+    "authorising": "authorizing",
+    "axe": "ax",
+    "backpedalled": "backpedaled",
+    "backpedalling": "backpedaling",
+    "bannister": "banister",
+    "bannisters": "banisters",
+    "baptise": "baptize",
+    "baptised": "baptized",
+    "baptises": "baptizes",
+    "baptising": "baptizing",
+    "bastardise": "bastardize",
+    "bastardised": "bastardized",
+    "bastardises": "bastardizes",
+    "bastardising": "bastardizing",
+    "battleax": "battleaxe",
+    "baulk": "balk",
+    "baulked": "balked",
+    "baulking": "balking",
+    "baulks": "balks",
+    "bedevilled": "bedeviled",
+    "bedevilling": "bedeviling",
+    "behaviour": "behavior",
+    "behavioural": "behavioral",
+    "behaviourism": "behaviorism",
+    "behaviourist": "behaviorist",
+    "behaviourists": "behaviorists",
+    "behaviours": "behaviors",
+    "behove": "behoove",
+    "behoved": "behooved",
+    "behoves": "behooves",
+    "bejewelled": "bejeweled",
+    "belabour": "belabor",
+    "belaboured": "belabored",
+    "belabouring": "belaboring",
+    "belabours": "belabors",
+    "bevelled": "beveled",
+    "bevvies": "bevies",
+    "bevvy": "bevy",
+    "biassed": "biased",
+    "biassing": "biasing",
+    "bingeing": "binging",
+    "bougainvillaea": "bougainvillea",
+    "bougainvillaeas": "bougainvilleas",
+    "bowdlerise": "bowdlerize",
+    "bowdlerised": "bowdlerized",
+    "bowdlerises": "bowdlerizes",
+    "bowdlerising": "bowdlerizing",
+    "breathalyse": "breathalyze",
+    "breathalysed": "breathalyzed",
+    "breathalyser": "breathalyzer",
+    "breathalysers": "breathalyzers",
+    "breathalyses": "breathalyzes",
+    "breathalysing": "breathalyzing",
+    "brutalise": "brutalize",
+    "brutalised": "brutalized",
+    "brutalises": "brutalizes",
+    "brutalising": "brutalizing",
+    "busses": "buses",
+    "bussing": "busing",
+    "caesarean": "cesarean",
+    "caesareans": "cesareans",
+    "calibre": "caliber",
+    "calibres": "calibers",
+    "calliper": "caliper",
+    "callipers": "calipers",
+    "callisthenics": "calisthenics",
+    "canalise": "canalize",
+    "canalised": "canalized",
+    "canalises": "canalizes",
+    "canalising": "canalizing",
+    "cancelation": "cancellation",
+    "cancelations": "cancellations",
+    "cancelled": "canceled",
+    "cancelling": "canceling",
+    "candour": "candor",
+    "cannibalise": "cannibalize",
+    "cannibalised": "cannibalized",
+    "cannibalises": "cannibalizes",
+    "cannibalising": "cannibalizing",
+    "canonise": "canonize",
+    "canonised": "canonized",
+    "canonises": "canonizes",
+    "canonising": "canonizing",
+    "capitalise": "capitalize",
+    "capitalised": "capitalized",
+    "capitalises": "capitalizes",
+    "capitalising": "capitalizing",
+    "caramelise": "caramelize",
+    "caramelised": "caramelized",
+    "caramelises": "caramelizes",
+    "caramelising": "caramelizing",
+    "carbonise": "carbonize",
+    "carbonised": "carbonized",
+    "carbonises": "carbonizes",
+    "carbonising": "carbonizing",
+    "carolled": "caroled",
+    "carolling": "caroling",
+    "catalogue": "catalog",
+    "catalogued": "cataloged",
+    "catalogues": "catalogs",
+    "cataloguing": "cataloging",
+    "catalyse": "catalyze",
+    "catalysed": "catalyzed",
+    "catalyses": "catalyzes",
+    "catalysing": "catalyzing",
+    "categorise": "categorize",
+    "categorised": "categorized",
+    "categorises": "categorizes",
+    "categorising": "categorizing",
+    "cauterise": "cauterize",
+    "cauterised": "cauterized",
+    "cauterises": "cauterizes",
+    "cauterising": "cauterizing",
+    "cavilled": "caviled",
+    "cavilling": "caviling",
+    "centigramme": "centigram",
+    "centigrammes": "centigrams",
+    "centilitre": "centiliter",
+    "centilitres": "centiliters",
+    "centimetre": "centimeter",
+    "centimetres": "centimeters",
+    "centralise": "centralize",
+    "centralised": "centralized",
+    "centralises": "centralizes",
+    "centralising": "centralizing",
+    "centre": "center",
+    "centred": "centered",
+    "centrefold": "centerfold",
+    "centrefolds": "centerfolds",
+    "centrepiece": "centerpiece",
+    "centrepieces": "centerpieces",
+    "centres": "centers",
+    "channelled": "channeled",
+    "channelling": "channeling",
+    "characterise": "characterize",
+    "characterised": "characterized",
+    "characterises": "characterizes",
+    "characterising": "characterizing",
+    "cheque": "check",
+    "chequebook": "checkbook",
+    "chequebooks": "checkbooks",
+    "chequered": "checkered",
+    "cheques": "checks",
+    "chilli": "chili",
+    "chimaera": "chimera",
+    "chimaeras": "chimeras",
+    "chiselled": "chiseled",
+    "chiselling": "chiseling",
+    "circularise": "circularize",
+    "circularised": "circularized",
+    "circularises": "circularizes",
+    "circularising": "circularizing",
+    "civilise": "civilize",
+    "civilised": "civilized",
+    "civilises": "civilizes",
+    "civilising": "civilizing",
+    "clamour": "clamor",
+    "clamoured": "clamored",
+    "clamouring": "clamoring",
+    "clamours": "clamors",
+    "clangour": "clangor",
+    "clarinettist": "clarinetist",
+    "clarinettists": "clarinetists",
+    "collectivise": "collectivize",
+    "collectivised": "collectivized",
+    "collectivises": "collectivizes",
+    "collectivising": "collectivizing",
+    "colonisation": "colonization",
+    "colonise": "colonize",
+    "colonised": "colonized",
+    "coloniser": "colonizer",
+    "colonisers": "colonizers",
+    "colonises": "colonizes",
+    "colonising": "colonizing",
+    "colour": "color",
+    "colourant": "colorant",
+    "colourants": "colorants",
+    "coloured": "colored",
+    "coloureds": "coloreds",
+    "colourful": "colorful",
+    "colourfully": "colorfully",
+    "colouring": "coloring",
+    "colourize": "colorize",
+    "colourized": "colorized",
+    "colourizes": "colorizes",
+    "colourizing": "colorizing",
+    "colourless": "colorless",
+    "colours": "colors",
+    "commercialise": "commercialize",
+    "commercialised": "commercialized",
+    "commercialises": "commercializes",
+    "commercialising": "commercializing",
+    "compartmentalise": "compartmentalize",
+    "compartmentalised": "compartmentalized",
+    "compartmentalises": "compartmentalizes",
+    "compartmentalising": "compartmentalizing",
+    "computerise": "computerize",
+    "computerised": "computerized",
+    "computerises": "computerizes",
+    "computerising": "computerizing",
+    "conceptualise": "conceptualize",
+    "conceptualised": "conceptualized",
+    "conceptualises": "conceptualizes",
+    "conceptualising": "conceptualizing",
+    "connexion": "connection",
+    "connexions": "connections",
+    "contextualise": "contextualize",
+    "contextualised": "contextualized",
+    "contextualises": "contextualizes",
+    "contextualising": "contextualizing",
+    "cosier": "cozier",
+    "cosies": "cozies",
+    "cosiest": "coziest",
+    "cosily": "cozily",
+    "cosiness": "coziness",
+    "cosy": "cozy",
+    "councillor": "councilor",
+    "councillors": "councilors",
+    "counselled": "counseled",
+    "counselling": "counseling",
+    "counsellor": "counselor",
+    "counsellors": "counselors",
+    "crenelated": "crenellated",
+    "criminalise": "criminalize",
+    "criminalised": "criminalized",
+    "criminalises": "criminalizes",
+    "criminalising": "criminalizing",
+    "criticise": "criticize",
+    "criticised": "criticized",
+    "criticises": "criticizes",
+    "criticising": "criticizing",
+    "crueller": "crueler",
+    "cruellest": "cruelest",
+    "crystallisation": "crystallization",
+    "crystallise": "crystallize",
+    "crystallised": "crystallized",
+    "crystallises": "crystallizes",
+    "crystallising": "crystallizing",
+    "cudgelled": "cudgeled",
+    "cudgelling": "cudgeling",
+    "customise": "customize",
+    "customised": "customized",
+    "customises": "customizes",
+    "customising": "customizing",
+    "cypher": "cipher",
+    "cyphers": "ciphers",
+    "decentralisation": "decentralization",
+    "decentralise": "decentralize",
+    "decentralised": "decentralized",
+    "decentralises": "decentralizes",
+    "decentralising": "decentralizing",
+    "decriminalisation": "decriminalization",
+    "decriminalise": "decriminalize",
+    "decriminalised": "decriminalized",
+    "decriminalises": "decriminalizes",
+    "decriminalising": "decriminalizing",
+    "defence": "defense",
+    "defenceless": "defenseless",
+    "defences": "defenses",
+    "dehumanisation": "dehumanization",
+    "dehumanise": "dehumanize",
+    "dehumanised": "dehumanized",
+    "dehumanises": "dehumanizes",
+    "dehumanising": "dehumanizing",
+    "demeanour": "demeanor",
+    "demilitarisation": "demilitarization",
+    "demilitarise": "demilitarize",
+    "demilitarised": "demilitarized",
+    "demilitarises": "demilitarizes",
+    "demilitarising": "demilitarizing",
+    "demobilisation": "demobilization",
+    "demobilise": "demobilize",
+    "demobilised": "demobilized",
+    "demobilises": "demobilizes",
+    "demobilising": "demobilizing",
+    "democratisation": "democratization",
+    "democratise": "democratize",
+    "democratised": "democratized",
+    "democratises": "democratizes",
+    "democratising": "democratizing",
+    "demonise": "demonize",
+    "demonised": "demonized",
+    "demonises": "demonizes",
+    "demonising": "demonizing",
+    "demoralisation": "demoralization",
+    "demoralise": "demoralize",
+    "demoralised": "demoralized",
+    "demoralises": "demoralizes",
+    "demoralising": "demoralizing",
+    "denationalisation": "denationalization",
+    "denationalise": "denationalize",
+    "denationalised": "denationalized",
+    "denationalises": "denationalizes",
+    "denationalising": "denationalizing",
+    "deodorise": "deodorize",
+    "deodorised": "deodorized",
+    "deodorises": "deodorizes",
+    "deodorising": "deodorizing",
+    "depersonalise": "depersonalize",
+    "depersonalised": "depersonalized",
+    "depersonalises": "depersonalizes",
+    "depersonalising": "depersonalizing",
+    "deputise": "deputize",
+    "deputised": "deputized",
+    "deputises": "deputizes",
+    "deputising": "deputizing",
+    "desensitisation": "desensitization",
+    "desensitise": "desensitize",
+    "desensitised": "desensitized",
+    "desensitises": "desensitizes",
+    "desensitising": "desensitizing",
+    "destabilisation": "destabilization",
+    "destabilise": "destabilize",
+    "destabilised": "destabilized",
+    "destabilises": "destabilizes",
+    "destabilising": "destabilizing",
+    "dialled": "dialed",
+    "dialling": "dialing",
+    "dialogue": "dialog",
+    "dialogues": "dialogs",
+    "diarrhoea": "diarrhea",
+    "digitise": "digitize",
+    "digitised": "digitized",
+    "digitises": "digitizes",
+    "digitising": "digitizing",
+    "disc": "disk",
+    "discolour": "discolor",
+    "discoloured": "discolored",
+    "discolouring": "discoloring",
+    "discolours": "discolors",
+    "discs": "disks",
+    "disembowelled": "disemboweled",
+    "disembowelling": "disemboweling",
+    "disfavour": "disfavor",
+    "dishevelled": "disheveled",
+    "dishonour": "dishonor",
+    "dishonourable": "dishonorable",
+    "dishonourably": "dishonorably",
+    "dishonoured": "dishonored",
+    "dishonouring": "dishonoring",
+    "dishonours": "dishonors",
+    "disorganisation": "disorganization",
+    "disorganised": "disorganized",
+    "distil": "distill",
+    "distils": "distills",
+    "dramatisation": "dramatization",
+    "dramatisations": "dramatizations",
+    "dramatise": "dramatize",
+    "dramatised": "dramatized",
+    "dramatises": "dramatizes",
+    "dramatising": "dramatizing",
+    "draught": "draft",
+    "draughtboard": "draftboard",
+    "draughtboards": "draftboards",
+    "draughtier": "draftier",
+    "draughtiest": "draftiest",
+    "draughts": "drafts",
+    "draughtsman": "draftsman",
+    "draughtsmanship": "draftsmanship",
+    "draughtsmen": "draftsmen",
+    "draughtswoman": "draftswoman",
+    "draughtswomen": "draftswomen",
+    "draughty": "drafty",
+    "drivelled": "driveled",
+    "drivelling": "driveling",
+    "duelled": "dueled",
+    "duelling": "dueling",
+    "economise": "economize",
+    "economised": "economized",
+    "economises": "economizes",
+    "economising": "economizing",
+    "edoema": "edema",
+    "editorialise": "editorialize",
+    "editorialised": "editorialized",
+    "editorialises": "editorializes",
+    "editorialising": "editorializing",
+    "empathise": "empathize",
+    "empathised": "empathized",
+    "empathises": "empathizes",
+    "empathising": "empathizing",
+    "emphasise": "emphasize",
+    "emphasised": "emphasized",
+    "emphasises": "emphasizes",
+    "emphasising": "emphasizing",
+    "enamelled": "enameled",
+    "enamelling": "enameling",
+    "enamoured": "enamored",
+    "encyclopaedia": "encyclopedia",
+    "encyclopaedias": "encyclopedias",
+    "encyclopaedic": "encyclopedic",
+    "endeavour": "endeavor",
+    "endeavoured": "endeavored",
+    "endeavouring": "endeavoring",
+    "endeavours": "endeavors",
+    "energise": "energize",
+    "energised": "energized",
+    "energises": "energizes",
+    "energising": "energizing",
+    "enrol": "enroll",
+    "enrols": "enrolls",
+    "enthral": "enthrall",
+    "enthrals": "enthralls",
+    "epaulette": "epaulet",
+    "epaulettes": "epaulets",
+    "epicentre": "epicenter",
+    "epicentres": "epicenters",
+    "epilogue": "epilog",
+    "epilogues": "epilogs",
+    "epitomise": "epitomize",
+    "epitomised": "epitomized",
+    "epitomises": "epitomizes",
+    "epitomising": "epitomizing",
+    "equalisation": "equalization",
+    "equalise": "equalize",
+    "equalised": "equalized",
+    "equaliser": "equalizer",
+    "equalisers": "equalizers",
+    "equalises": "equalizes",
+    "equalising": "equalizing",
+    "eulogise": "eulogize",
+    "eulogised": "eulogized",
+    "eulogises": "eulogizes",
+    "eulogising": "eulogizing",
+    "evangelise": "evangelize",
+    "evangelised": "evangelized",
+    "evangelises": "evangelizes",
+    "evangelising": "evangelizing",
+    "exorcise": "exorcize",
+    "exorcised": "exorcized",
+    "exorcises": "exorcizes",
+    "exorcising": "exorcizing",
+    "extemporisation": "extemporization",
+    "extemporise": "extemporize",
+    "extemporised": "extemporized",
+    "extemporises": "extemporizes",
+    "extemporising": "extemporizing",
+    "externalisation": "externalization",
+    "externalisations": "externalizations",
+    "externalise": "externalize",
+    "externalised": "externalized",
+    "externalises": "externalizes",
+    "externalising": "externalizing",
+    "factorise": "factorize",
+    "factorised": "factorized",
+    "factorises": "factorizes",
+    "factorising": "factorizing",
+    "faecal": "fecal",
+    "faeces": "feces",
+    "familiarisation": "familiarization",
+    "familiarise": "familiarize",
+    "familiarised": "familiarized",
+    "familiarises": "familiarizes",
+    "familiarising": "familiarizing",
+    "fantasise": "fantasize",
+    "fantasised": "fantasized",
+    "fantasises": "fantasizes",
+    "fantasising": "fantasizing",
+    "favour": "favor",
+    "favourable": "favorable",
+    "favourably": "favorably",
+    "favoured": "favored",
+    "favouring": "favoring",
+    "favourite": "favorite",
+    "favourites": "favorites",
+    "favouritism": "favoritism",
+    "favours": "favors",
+    "feminise": "feminize",
+    "feminised": "feminized",
+    "feminises": "feminizes",
+    "feminising": "feminizing",
+    "fertilisation": "fertilization",
+    "fertilise": "fertilize",
+    "fertilised": "fertilized",
+    "fertiliser": "fertilizer",
+    "fertilisers": "fertilizers",
+    "fertilises": "fertilizes",
+    "fertilising": "fertilizing",
+    "fervour": "fervor",
+    "fibre": "fiber",
+    "fibreglass": "fiberglass",
+    "fibres": "fibers",
+    "fictionalisation": "fictionalization",
+    "fictionalisations": "fictionalizations",
+    "fictionalise": "fictionalize",
+    "fictionalised": "fictionalized",
+    "fictionalises": "fictionalizes",
+    "fictionalising": "fictionalizing",
+    "fillet": "filet",
+    "filleted": "fileted",
+    "filleting": "fileting",
+    "fillets": "filets",
+    "finalisation": "finalization",
+    "finalise": "finalize",
+    "finalised": "finalized",
+    "finalises": "finalizes",
+    "finalising": "finalizing",
+    "flautist": "flutist",
+    "flautists": "flutists",
+    "flavour": "flavor",
+    "flavoured": "flavored",
+    "flavouring": "flavoring",
+    "flavourings": "flavorings",
+    "flavourless": "flavorless",
+    "flavours": "flavors",
+    "flavoursome": "flavorsome",
+    "flyer / flier": "flier / flyer",
+    "foetal": "fetal",
+    "foetid": "fetid",
+    "foetus": "fetus",
+    "foetuses": "fetuses",
+    "formalisation": "formalization",
+    "formalise": "formalize",
+    "formalised": "formalized",
+    "formalises": "formalizes",
+    "formalising": "formalizing",
+    "fossilisation": "fossilization",
+    "fossilise": "fossilize",
+    "fossilised": "fossilized",
+    "fossilises": "fossilizes",
+    "fossilising": "fossilizing",
+    "fraternisation": "fraternization",
+    "fraternise": "fraternize",
+    "fraternised": "fraternized",
+    "fraternises": "fraternizes",
+    "fraternising": "fraternizing",
+    "fulfil": "fulfill",
+    "fulfilment": "fulfillment",
+    "fulfils": "fulfills",
+    "funnelled": "funneled",
+    "funnelling": "funneling",
+    "galvanise": "galvanize",
+    "galvanised": "galvanized",
+    "galvanises": "galvanizes",
+    "galvanising": "galvanizing",
+    "gambolled": "gamboled",
+    "gambolling": "gamboling",
+    "gaol": "jail",
+    "gaolbird": "jailbird",
+    "gaolbirds": "jailbirds",
+    "gaolbreak": "jailbreak",
+    "gaolbreaks": "jailbreaks",
+    "gaoled": "jailed",
+    "gaoler": "jailer",
+    "gaolers": "jailers",
+    "gaoling": "jailing",
+    "gaols": "jails",
+    "gasses": "gases",
+    "gage": "gauge",
+    "gaged": "gauged",
+    "gages": "gauges",
+    "gaging": "gauging",
+    "generalisation": "generalization",
+    "generalisations": "generalizations",
+    "generalise": "generalize",
+    "generalised": "generalized",
+    "generalises": "generalizes",
+    "generalising": "generalizing",
+    "ghettoise": "ghettoize",
+    "ghettoised": "ghettoized",
+    "ghettoises": "ghettoizes",
+    "ghettoising": "ghettoizing",
+    "gipsies": "gypsies",
+    "glamorise": "glamorize",
+    "glamorised": "glamorized",
+    "glamorises": "glamorizes",
+    "glamorising": "glamorizing",
+    "glamor": "glamour",
+    "globalisation": "globalization",
+    "globalise": "globalize",
+    "globalised": "globalized",
+    "globalises": "globalizes",
+    "globalising": "globalizing",
+    "glueing": "gluing",
+    "goitre": "goiter",
+    "goitres": "goiters",
+    "gonorrhoea": "gonorrhea",
+    "gramme": "gram",
+    "grammes": "grams",
+    "gravelled": "graveled",
+    "grey": "gray",
+    "greyed": "grayed",
+    "greying": "graying",
+    "greyish": "grayish",
+    "greyness": "grayness",
+    "greys": "grays",
+    "grovelled": "groveled",
+    "grovelling": "groveling",
+    "groyne": "groin",
+    "groynes": "groins",
+    "gruelling": "grueling",
+    "gruellingly": "gruelingly",
+    "gryphon": "griffin",
+    "gryphons": "griffins",
+    "gynaecological": "gynecological",
+    "gynaecologist": "gynecologist",
+    "gynaecologists": "gynecologists",
+    "gynaecology": "gynecology",
+    "haematological": "hematological",
+    "haematologist": "hematologist",
+    "haematologists": "hematologists",
+    "haematology": "hematology",
+    "haemoglobin": "hemoglobin",
+    "haemophilia": "hemophilia",
+    "haemophiliac": "hemophiliac",
+    "haemophiliacs": "hemophiliacs",
+    "haemorrhage": "hemorrhage",
+    "haemorrhaged": "hemorrhaged",
+    "haemorrhages": "hemorrhages",
+    "haemorrhaging": "hemorrhaging",
+    "haemorrhoids": "hemorrhoids",
+    "harbour": "harbor",
+    "harboured": "harbored",
+    "harbouring": "harboring",
+    "harbours": "harbors",
+    "harmonisation": "harmonization",
+    "harmonise": "harmonize",
+    "harmonised": "harmonized",
+    "harmonises": "harmonizes",
+    "harmonising": "harmonizing",
+    "homoeopath": "homeopath",
+    "homoeopathic": "homeopathic",
+    "homoeopaths": "homeopaths",
+    "homoeopathy": "homeopathy",
+    "homogenise": "homogenize",
+    "homogenised": "homogenized",
+    "homogenises": "homogenizes",
+    "homogenising": "homogenizing",
+    "honour": "honor",
+    "honourable": "honorable",
+    "honourably": "honorably",
+    "honoured": "honored",
+    "honouring": "honoring",
+    "honours": "honors",
+    "hospitalisation": "hospitalization",
+    "hospitalise": "hospitalize",
+    "hospitalised": "hospitalized",
+    "hospitalises": "hospitalizes",
+    "hospitalising": "hospitalizing",
+    "humanise": "humanize",
+    "humanised": "humanized",
+    "humanises": "humanizes",
+    "humanising": "humanizing",
+    "humour": "humor",
+    "humoured": "humored",
+    "humouring": "humoring",
+    "humourless": "humorless",
+    "humours": "humors",
+    "hybridise": "hybridize",
+    "hybridised": "hybridized",
+    "hybridises": "hybridizes",
+    "hybridising": "hybridizing",
+    "hypnotise": "hypnotize",
+    "hypnotised": "hypnotized",
+    "hypnotises": "hypnotizes",
+    "hypnotising": "hypnotizing",
+    "hypothesise": "hypothesize",
+    "hypothesised": "hypothesized",
+    "hypothesises": "hypothesizes",
+    "hypothesising": "hypothesizing",
+    "idealisation": "idealization",
+    "idealise": "idealize",
+    "idealised": "idealized",
+    "idealises": "idealizes",
+    "idealising": "idealizing",
+    "idolise": "idolize",
+    "idolised": "idolized",
+    "idolises": "idolizes",
+    "idolising": "idolizing",
+    "immobilisation": "immobilization",
+    "immobilise": "immobilize",
+    "immobilised": "immobilized",
+    "immobiliser": "immobilizer",
+    "immobilisers": "immobilizers",
+    "immobilises": "immobilizes",
+    "immobilising": "immobilizing",
+    "immortalise": "immortalize",
+    "immortalised": "immortalized",
+    "immortalises": "immortalizes",
+    "immortalising": "immortalizing",
+    "immunisation": "immunization",
+    "immunise": "immunize",
+    "immunised": "immunized",
+    "immunises": "immunizes",
+    "immunising": "immunizing",
+    "impanelled": "impaneled",
+    "impanelling": "impaneling",
+    "imperilled": "imperiled",
+    "imperilling": "imperiling",
+    "individualise": "individualize",
+    "individualised": "individualized",
+    "individualises": "individualizes",
+    "individualising": "individualizing",
+    "industrialise": "industrialize",
+    "industrialised": "industrialized",
+    "industrialises": "industrializes",
+    "industrialising": "industrializing",
+    "inflexion": "inflection",
+    "inflexions": "inflections",
+    "initialise": "initialize",
+    "initialised": "initialized",
+    "initialises": "initializes",
+    "initialising": "initializing",
+    "initialled": "initialed",
+    "initialling": "initialing",
+    "instal": "install",
+    "instalment": "installment",
+    "instalments": "installments",
+    "instals": "installs",
+    "instil": "instill",
+    "instils": "instills",
+    "institutionalisation": "institutionalization",
+    "institutionalise": "institutionalize",
+    "institutionalised": "institutionalized",
+    "institutionalises": "institutionalizes",
+    "institutionalising": "institutionalizing",
+    "intellectualise": "intellectualize",
+    "intellectualised": "intellectualized",
+    "intellectualises": "intellectualizes",
+    "intellectualising": "intellectualizing",
+    "internalisation": "internalization",
+    "internalise": "internalize",
+    "internalised": "internalized",
+    "internalises": "internalizes",
+    "internalising": "internalizing",
+    "internationalisation": "internationalization",
+    "internationalise": "internationalize",
+    "internationalised": "internationalized",
+    "internationalises": "internationalizes",
+    "internationalising": "internationalizing",
+    "ionisation": "ionization",
+    "ionise": "ionize",
+    "ionised": "ionized",
+    "ioniser": "ionizer",
+    "ionisers": "ionizers",
+    "ionises": "ionizes",
+    "ionising": "ionizing",
+    "italicise": "italicize",
+    "italicised": "italicized",
+    "italicises": "italicizes",
+    "italicising": "italicizing",
+    "itemise": "itemize",
+    "itemised": "itemized",
+    "itemises": "itemizes",
+    "itemising": "itemizing",
+    "jeopardise": "jeopardize",
+    "jeopardised": "jeopardized",
+    "jeopardises": "jeopardizes",
+    "jeopardising": "jeopardizing",
+    "jewelled": "jeweled",
+    "jeweller": "jeweler",
+    "jewellers": "jewelers",
+    "jewellery": "jewelry",
+    "judgement": "judgment",
+    "kilogramme": "kilogram",
+    "kilogrammes": "kilograms",
+    "kilometre": "kilometer",
+    "kilometres": "kilometers",
+    "labelled": "labeled",
+    "labelling": "labeling",
+    "labour": "labor",
+    "laboured": "labored",
+    "labourer": "laborer",
+    "labourers": "laborers",
+    "labouring": "laboring",
+    "labours": "labors",
+    "lacklustre": "lackluster",
+    "legalisation": "legalization",
+    "legalise": "legalize",
+    "legalised": "legalized",
+    "legalises": "legalizes",
+    "legalising": "legalizing",
+    "legitimise": "legitimize",
+    "legitimised": "legitimized",
+    "legitimises": "legitimizes",
+    "legitimising": "legitimizing",
+    "leukaemia": "leukemia",
+    "levelled": "leveled",
+    "leveller": "leveler",
+    "levellers": "levelers",
+    "levelling": "leveling",
+    "libelled": "libeled",
+    "libelling": "libeling",
+    "libellous": "libelous",
+    "liberalisation": "liberalization",
+    "liberalise": "liberalize",
+    "liberalised": "liberalized",
+    "liberalises": "liberalizes",
+    "liberalising": "liberalizing",
+    "licence": "license",
+    "licenced": "licensed",
+    "licences": "licenses",
+    "licencing": "licensing",
+    "likeable": "likable",
+    "lionisation": "lionization",
+    "lionise": "lionize",
+    "lionised": "lionized",
+    "lionises": "lionizes",
+    "lionising": "lionizing",
+    "liquidise": "liquidize",
+    "liquidised": "liquidized",
+    "liquidiser": "liquidizer",
+    "liquidisers": "liquidizers",
+    "liquidises": "liquidizes",
+    "liquidising": "liquidizing",
+    "litre": "liter",
+    "litres": "liters",
+    "localise": "localize",
+    "localised": "localized",
+    "localises": "localizes",
+    "localising": "localizing",
+    "louvre": "louver",
+    "louvred": "louvered",
+    "louvres": "louvers",
+    "lustre": "luster",
+    "magnetise": "magnetize",
+    "magnetised": "magnetized",
+    "magnetises": "magnetizes",
+    "magnetising": "magnetizing",
+    "manoeuvrability": "maneuverability",
+    "manoeuvrable": "maneuverable",
+    "manoeuvre": "maneuver",
+    "manoeuvred": "maneuvered",
+    "manoeuvres": "maneuvers",
+    "manoeuvring": "maneuvering",
+    "manoeuvrings": "maneuverings",
+    "marginalisation": "marginalization",
+    "marginalise": "marginalize",
+    "marginalised": "marginalized",
+    "marginalises": "marginalizes",
+    "marginalising": "marginalizing",
+    "marshalled": "marshaled",
+    "marshalling": "marshaling",
+    "marvelled": "marveled",
+    "marvelling": "marveling",
+    "marvellous": "marvelous",
+    "marvellously": "marvelously",
+    "materialisation": "materialization",
+    "materialise": "materialize",
+    "materialised": "materialized",
+    "materialises": "materializes",
+    "materialising": "materializing",
+    "maximisation": "maximization",
+    "maximise": "maximize",
+    "maximised": "maximized",
+    "maximises": "maximizes",
+    "maximising": "maximizing",
+    "meagre": "meager",
+    "mechanisation": "mechanization",
+    "mechanise": "mechanize",
+    "mechanised": "mechanized",
+    "mechanises": "mechanizes",
+    "mechanising": "mechanizing",
+    "mediaeval": "medieval",
+    "memorialise": "memorialize",
+    "memorialised": "memorialized",
+    "memorialises": "memorializes",
+    "memorialising": "memorializing",
+    "memorise": "memorize",
+    "memorised": "memorized",
+    "memorises": "memorizes",
+    "memorising": "memorizing",
+    "mesmerise": "mesmerize",
+    "mesmerised": "mesmerized",
+    "mesmerises": "mesmerizes",
+    "mesmerising": "mesmerizing",
+    "metabolise": "metabolize",
+    "metabolised": "metabolized",
+    "metabolises": "metabolizes",
+    "metabolising": "metabolizing",
+    "metre": "meter",
+    "metres": "meters",
+    "micrometre": "micrometer",
+    "micrometres": "micrometers",
+    "militarise": "militarize",
+    "militarised": "militarized",
+    "militarises": "militarizes",
+    "militarising": "militarizing",
+    "milligramme": "milligram",
+    "milligrammes": "milligrams",
+    "millilitre": "milliliter",
+    "millilitres": "milliliters",
+    "millimetre": "millimeter",
+    "millimetres": "millimeters",
+    "miniaturisation": "miniaturization",
+    "miniaturise": "miniaturize",
+    "miniaturised": "miniaturized",
+    "miniaturises": "miniaturizes",
+    "miniaturising": "miniaturizing",
+    "minibusses": "minibuses",
+    "minimise": "minimize",
+    "minimised": "minimized",
+    "minimises": "minimizes",
+    "minimising": "minimizing",
+    "misbehaviour": "misbehavior",
+    "misdemeanour": "misdemeanor",
+    "misdemeanours": "misdemeanors",
+    "misspelt": "misspelled",
+    "mitre": "miter",
+    "mitres": "miters",
+    "mobilisation": "mobilization",
+    "mobilise": "mobilize",
+    "mobilised": "mobilized",
+    "mobilises": "mobilizes",
+    "mobilising": "mobilizing",
+    "modelled": "modeled",
+    "modeller": "modeler",
+    "modellers": "modelers",
+    "modelling": "modeling",
+    "modernise": "modernize",
+    "modernised": "modernized",
+    "modernises": "modernizes",
+    "modernising": "modernizing",
+    "moisturise": "moisturize",
+    "moisturised": "moisturized",
+    "moisturiser": "moisturizer",
+    "moisturisers": "moisturizers",
+    "moisturises": "moisturizes",
+    "moisturising": "moisturizing",
+    "monologue": "monolog",
+    "monologues": "monologs",
+    "monopolisation": "monopolization",
+    "monopolise": "monopolize",
+    "monopolised": "monopolized",
+    "monopolises": "monopolizes",
+    "monopolising": "monopolizing",
+    "moralise": "moralize",
+    "moralised": "moralized",
+    "moralises": "moralizes",
+    "moralising": "moralizing",
+    "motorised": "motorized",
+    "mould": "mold",
+    "moulded": "molded",
+    "moulder": "molder",
+    "mouldered": "moldered",
+    "mouldering": "moldering",
+    "moulders": "molders",
+    "mouldier": "moldier",
+    "mouldiest": "moldiest",
+    "moulding": "molding",
+    "mouldings": "moldings",
+    "moulds": "molds",
+    "mouldy": "moldy",
+    "moult": "molt",
+    "moulted": "molted",
+    "moulting": "molting",
+    "moults": "molts",
+    "moustache": "mustache",
+    "moustached": "mustached",
+    "moustaches": "mustaches",
+    "moustachioed": "mustachioed",
+    "multicoloured": "multicolored",
+    "nationalisation": "nationalization",
+    "nationalisations": "nationalizations",
+    "nationalise": "nationalize",
+    "nationalised": "nationalized",
+    "nationalises": "nationalizes",
+    "nationalising": "nationalizing",
+    "naturalisation": "naturalization",
+    "naturalise": "naturalize",
+    "naturalised": "naturalized",
+    "naturalises": "naturalizes",
+    "naturalising": "naturalizing",
+    "neighbour": "neighbor",
+    "neighbourhood": "neighborhood",
+    "neighbourhoods": "neighborhoods",
+    "neighbouring": "neighboring",
+    "neighbourliness": "neighborliness",
+    "neighbourly": "neighborly",
+    "neighbours": "neighbors",
+    "neutralisation": "neutralization",
+    "neutralise": "neutralize",
+    "neutralised": "neutralized",
+    "neutralises": "neutralizes",
+    "neutralising": "neutralizing",
+    "normalisation": "normalization",
+    "normalise": "normalize",
+    "normalised": "normalized",
+    "normalises": "normalizes",
+    "normalising": "normalizing",
+    "odour": "odor",
+    "odourless": "odorless",
+    "odours": "odors",
+    "oesophagus": "esophagus",
+    "oesophaguses": "esophaguses",
+    "oestrogen": "estrogen",
+    "offence": "offense",
+    "offences": "offenses",
+    "omelette": "omelet",
+    "omelettes": "omelets",
+    "optimise": "optimize",
+    "optimised": "optimized",
+    "optimises": "optimizes",
+    "optimising": "optimizing",
+    "organisation": "organization",
+    "organisational": "organizational",
+    "organisations": "organizations",
+    "organise": "organize",
+    "organised": "organized",
+    "organiser": "organizer",
+    "organisers": "organizers",
+    "organises": "organizes",
+    "organising": "organizing",
+    "orthopaedic": "orthopedic",
+    "orthopaedics": "orthopedics",
+    "ostracise": "ostracize",
+    "ostracised": "ostracized",
+    "ostracises": "ostracizes",
+    "ostracising": "ostracizing",
+    "outmanoeuvre": "outmaneuver",
+    "outmanoeuvred": "outmaneuvered",
+    "outmanoeuvres": "outmaneuvers",
+    "outmanoeuvring": "outmaneuvering",
+    "overemphasise": "overemphasize",
+    "overemphasised": "overemphasized",
+    "overemphasises": "overemphasizes",
+    "overemphasising": "overemphasizing",
+    "oxidisation": "oxidization",
+    "oxidise": "oxidize",
+    "oxidised": "oxidized",
+    "oxidises": "oxidizes",
+    "oxidising": "oxidizing",
+    "paederast": "pederast",
+    "paederasts": "pederasts",
+    "paediatric": "pediatric",
+    "paediatrician": "pediatrician",
+    "paediatricians": "pediatricians",
+    "paediatrics": "pediatrics",
+    "paedophile": "pedophile",
+    "paedophiles": "pedophiles",
+    "paedophilia": "pedophilia",
+    "palaeolithic": "paleolithic",
+    "palaeontologist": "paleontologist",
+    "palaeontologists": "paleontologists",
+    "palaeontology": "paleontology",
+    "panelled": "paneled",
+    "panelling": "paneling",
+    "panellist": "panelist",
+    "panellists": "panelists",
+    "paralyse": "paralyze",
+    "paralysed": "paralyzed",
+    "paralyses": "paralyzes",
+    "paralysing": "paralyzing",
+    "parcelled": "parceled",
+    "parcelling": "parceling",
+    "parlour": "parlor",
+    "parlours": "parlors",
+    "particularise": "particularize",
+    "particularised": "particularized",
+    "particularises": "particularizes",
+    "particularising": "particularizing",
+    "passivisation": "passivization",
+    "passivise": "passivize",
+    "passivised": "passivized",
+    "passivises": "passivizes",
+    "passivising": "passivizing",
+    "pasteurisation": "pasteurization",
+    "pasteurise": "pasteurize",
+    "pasteurised": "pasteurized",
+    "pasteurises": "pasteurizes",
+    "pasteurising": "pasteurizing",
+    "patronise": "patronize",
+    "patronised": "patronized",
+    "patronises": "patronizes",
+    "patronising": "patronizing",
+    "patronisingly": "patronizingly",
+    "pedalled": "pedaled",
+    "pedalling": "pedaling",
+    "pedestrianisation": "pedestrianization",
+    "pedestrianise": "pedestrianize",
+    "pedestrianised": "pedestrianized",
+    "pedestrianises": "pedestrianizes",
+    "pedestrianising": "pedestrianizing",
+    "penalise": "penalize",
+    "penalised": "penalized",
+    "penalises": "penalizes",
+    "penalising": "penalizing",
+    "pencilled": "penciled",
+    "pencilling": "penciling",
+    "personalise": "personalize",
+    "personalised": "personalized",
+    "personalises": "personalizes",
+    "personalising": "personalizing",
+    "pharmacopoeia": "pharmacopeia",
+    "pharmacopoeias": "pharmacopeias",
+    "philosophise": "philosophize",
+    "philosophised": "philosophized",
+    "philosophises": "philosophizes",
+    "philosophising": "philosophizing",
+    "philtre": "filter",
+    "philtres": "filters",
+    "phoney": "phony",
+    "plagiarise": "plagiarize",
+    "plagiarised": "plagiarized",
+    "plagiarises": "plagiarizes",
+    "plagiarising": "plagiarizing",
+    "plough": "plow",
+    "ploughed": "plowed",
+    "ploughing": "plowing",
+    "ploughman": "plowman",
+    "ploughmen": "plowmen",
+    "ploughs": "plows",
+    "ploughshare": "plowshare",
+    "ploughshares": "plowshares",
+    "polarisation": "polarization",
+    "polarise": "polarize",
+    "polarised": "polarized",
+    "polarises": "polarizes",
+    "polarising": "polarizing",
+    "politicisation": "politicization",
+    "politicise": "politicize",
+    "politicised": "politicized",
+    "politicises": "politicizes",
+    "politicising": "politicizing",
+    "popularisation": "popularization",
+    "popularise": "popularize",
+    "popularised": "popularized",
+    "popularises": "popularizes",
+    "popularising": "popularizing",
+    "pouffe": "pouf",
+    "pouffes": "poufs",
+    "practise": "practice",
+    "practised": "practiced",
+    "practises": "practices",
+    "practising": "practicing",
+    "praesidium": "presidium",
+    "praesidiums": "presidiums",
+    "pressurisation": "pressurization",
+    "pressurise": "pressurize",
+    "pressurised": "pressurized",
+    "pressurises": "pressurizes",
+    "pressurising": "pressurizing",
+    "pretence": "pretense",
+    "pretences": "pretenses",
+    "primaeval": "primeval",
+    "prioritisation": "prioritization",
+    "prioritise": "prioritize",
+    "prioritised": "prioritized",
+    "prioritises": "prioritizes",
+    "prioritising": "prioritizing",
+    "privatisation": "privatization",
+    "privatisations": "privatizations",
+    "privatise": "privatize",
+    "privatised": "privatized",
+    "privatises": "privatizes",
+    "privatising": "privatizing",
+    "professionalisation": "professionalization",
+    "professionalise": "professionalize",
+    "professionalised": "professionalized",
+    "professionalises": "professionalizes",
+    "professionalising": "professionalizing",
+    "programme": "program",
+    "programmes": "programs",
+    "prologue": "prolog",
+    "prologues": "prologs",
+    "propagandise": "propagandize",
+    "propagandised": "propagandized",
+    "propagandises": "propagandizes",
+    "propagandising": "propagandizing",
+    "proselytise": "proselytize",
+    "proselytised": "proselytized",
+    "proselytiser": "proselytizer",
+    "proselytisers": "proselytizers",
+    "proselytises": "proselytizes",
+    "proselytising": "proselytizing",
+    "psychoanalyse": "psychoanalyze",
+    "psychoanalysed": "psychoanalyzed",
+    "psychoanalyses": "psychoanalyzes",
+    "psychoanalysing": "psychoanalyzing",
+    "publicise": "publicize",
+    "publicised": "publicized",
+    "publicises": "publicizes",
+    "publicising": "publicizing",
+    "pulverisation": "pulverization",
+    "pulverise": "pulverize",
+    "pulverised": "pulverized",
+    "pulverises": "pulverizes",
+    "pulverising": "pulverizing",
+    "pummelled": "pummel",
+    "pummelling": "pummeled",
+    "pyjama": "pajama",
+    "pyjamas": "pajamas",
+    "pzazz": "pizzazz",
+    "quarrelled": "quarreled",
+    "quarrelling": "quarreling",
+    "radicalise": "radicalize",
+    "radicalised": "radicalized",
+    "radicalises": "radicalizes",
+    "radicalising": "radicalizing",
+    "rancour": "rancor",
+    "randomise": "randomize",
+    "randomised": "randomized",
+    "randomises": "randomizes",
+    "randomising": "randomizing",
+    "rationalisation": "rationalization",
+    "rationalisations": "rationalizations",
+    "rationalise": "rationalize",
+    "rationalised": "rationalized",
+    "rationalises": "rationalizes",
+    "rationalising": "rationalizing",
+    "ravelled": "raveled",
+    "ravelling": "raveling",
+    "realisable": "realizable",
+    "realisation": "realization",
+    "realisations": "realizations",
+    "realise": "realize",
+    "realised": "realized",
+    "realises": "realizes",
+    "realising": "realizing",
+    "recognisable": "recognizable",
+    "recognisably": "recognizably",
+    "recognisance": "recognizance",
+    "recognise": "recognize",
+    "recognised": "recognized",
+    "recognises": "recognizes",
+    "recognising": "recognizing",
+    "reconnoitre": "reconnoiter",
+    "reconnoitred": "reconnoitered",
+    "reconnoitres": "reconnoiters",
+    "reconnoitring": "reconnoitering",
+    "refuelled": "refueled",
+    "refuelling": "refueling",
+    "regularisation": "regularization",
+    "regularise": "regularize",
+    "regularised": "regularized",
+    "regularises": "regularizes",
+    "regularising": "regularizing",
+    "remodelled": "remodeled",
+    "remodelling": "remodeling",
+    "remould": "remold",
+    "remoulded": "remolded",
+    "remoulding": "remolding",
+    "remoulds": "remolds",
+    "reorganisation": "reorganization",
+    "reorganisations": "reorganizations",
+    "reorganise": "reorganize",
+    "reorganised": "reorganized",
+    "reorganises": "reorganizes",
+    "reorganising": "reorganizing",
+    "revelled": "reveled",
+    "reveller": "reveler",
+    "revellers": "revelers",
+    "revelling": "reveling",
+    "revitalise": "revitalize",
+    "revitalised": "revitalized",
+    "revitalises": "revitalizes",
+    "revitalising": "revitalizing",
+    "revolutionise": "revolutionize",
+    "revolutionised": "revolutionized",
+    "revolutionises": "revolutionizes",
+    "revolutionising": "revolutionizing",
+    "rhapsodise": "rhapsodize",
+    "rhapsodised": "rhapsodized",
+    "rhapsodises": "rhapsodizes",
+    "rhapsodising": "rhapsodizing",
+    "rigour": "rigor",
+    "rigours": "rigors",
+    "ritualised": "ritualized",
+    "rivalled": "rivaled",
+    "rivalling": "rivaling",
+    "romanticise": "romanticize",
+    "romanticised": "romanticized",
+    "romanticises": "romanticizes",
+    "romanticising": "romanticizing",
+    "rumour": "rumor",
+    "rumoured": "rumored",
+    "rumours": "rumors",
+    "sabre": "saber",
+    "sabres": "sabers",
+    "saltpetre": "saltpeter",
+    "sanitise": "sanitize",
+    "sanitised": "sanitized",
+    "sanitises": "sanitizes",
+    "sanitising": "sanitizing",
+    "satirise": "satirize",
+    "satirised": "satirized",
+    "satirises": "satirizes",
+    "satirising": "satirizing",
+    "saviour": "savior",
+    "saviours": "saviors",
+    "savour": "savor",
+    "savoured": "savored",
+    "savouries": "savories",
+    "savouring": "savoring",
+    "savours": "savors",
+    "savoury": "savory",
+    "scandalise": "scandalize",
+    "scandalised": "scandalized",
+    "scandalises": "scandalizes",
+    "scandalising": "scandalizing",
+    "sceptic": "skeptic",
+    "sceptical": "skeptical",
+    "sceptically": "skeptically",
+    "scepticism": "skepticism",
+    "sceptics": "skeptics",
+    "sceptre": "scepter",
+    "sceptres": "scepters",
+    "scrutinise": "scrutinize",
+    "scrutinised": "scrutinized",
+    "scrutinises": "scrutinizes",
+    "scrutinising": "scrutinizing",
+    "secularisation": "secularization",
+    "secularise": "secularize",
+    "secularised": "secularized",
+    "secularises": "secularizes",
+    "secularising": "secularizing",
+    "sensationalise": "sensationalize",
+    "sensationalised": "sensationalized",
+    "sensationalises": "sensationalizes",
+    "sensationalising": "sensationalizing",
+    "sensitise": "sensitize",
+    "sensitised": "sensitized",
+    "sensitises": "sensitizes",
+    "sensitising": "sensitizing",
+    "sentimentalise": "sentimentalize",
+    "sentimentalised": "sentimentalized",
+    "sentimentalises": "sentimentalizes",
+    "sentimentalising": "sentimentalizing",
+    "sepulchre": "sepulcher",
+    "sepulchres": "sepulchers",
+    "serialisation": "serialization",
+    "serialisations": "serializations",
+    "serialise": "serialize",
+    "serialised": "serialized",
+    "serialises": "serializes",
+    "serialising": "serializing",
+    "sermonise": "sermonize",
+    "sermonised": "sermonized",
+    "sermonises": "sermonizes",
+    "sermonising": "sermonizing",
+    "sheikh": "sheik",
+    "shovelled": "shoveled",
+    "shovelling": "shoveling",
+    "shrivelled": "shriveled",
+    "shrivelling": "shriveling",
+    "signalise": "signalize",
+    "signalised": "signalized",
+    "signalises": "signalizes",
+    "signalising": "signalizing",
+    "signalled": "signaled",
+    "signalling": "signaling",
+    "smoulder": "smolder",
+    "smouldered": "smoldered",
+    "smouldering": "smoldering",
+    "smoulders": "smolders",
+    "snivelled": "sniveled",
+    "snivelling": "sniveling",
+    "snorkelled": "snorkeled",
+    "snorkelling": "snorkeling",
+    "snowplough": "snowplow",
+    "snowploughs": "snowplow",
+    "socialisation": "socialization",
+    "socialise": "socialize",
+    "socialised": "socialized",
+    "socialises": "socializes",
+    "socialising": "socializing",
+    "sodomise": "sodomize",
+    "sodomised": "sodomized",
+    "sodomises": "sodomizes",
+    "sodomising": "sodomizing",
+    "solemnise": "solemnize",
+    "solemnised": "solemnized",
+    "solemnises": "solemnizes",
+    "solemnising": "solemnizing",
+    "sombre": "somber",
+    "specialisation": "specialization",
+    "specialisations": "specializations",
+    "specialise": "specialize",
+    "specialised": "specialized",
+    "specialises": "specializes",
+    "specialising": "specializing",
+    "spectre": "specter",
+    "spectres": "specters",
+    "spiralled": "spiraled",
+    "spiralling": "spiraling",
+    "splendour": "splendor",
+    "splendours": "splendors",
+    "squirrelled": "squirreled",
+    "squirrelling": "squirreling",
+    "stabilisation": "stabilization",
+    "stabilise": "stabilize",
+    "stabilised": "stabilized",
+    "stabiliser": "stabilizer",
+    "stabilisers": "stabilizers",
+    "stabilises": "stabilizes",
+    "stabilising": "stabilizing",
+    "standardisation": "standardization",
+    "standardise": "standardize",
+    "standardised": "standardized",
+    "standardises": "standardizes",
+    "standardising": "standardizing",
+    "stencilled": "stenciled",
+    "stencilling": "stenciling",
+    "sterilisation": "sterilization",
+    "sterilisations": "sterilizations",
+    "sterilise": "sterilize",
+    "sterilised": "sterilized",
+    "steriliser": "sterilizer",
+    "sterilisers": "sterilizers",
+    "sterilises": "sterilizes",
+    "sterilising": "sterilizing",
+    "stigmatisation": "stigmatization",
+    "stigmatise": "stigmatize",
+    "stigmatised": "stigmatized",
+    "stigmatises": "stigmatizes",
+    "stigmatising": "stigmatizing",
+    "storey": "story",
+    "storeys": "stories",
+    "subsidisation": "subsidization",
+    "subsidise": "subsidize",
+    "subsidised": "subsidized",
+    "subsidiser": "subsidizer",
+    "subsidisers": "subsidizers",
+    "subsidises": "subsidizes",
+    "subsidising": "subsidizing",
+    "succour": "succor",
+    "succoured": "succored",
+    "succouring": "succoring",
+    "succours": "succors",
+    "sulphate": "sulfate",
+    "sulphates": "sulfates",
+    "sulphide": "sulfide",
+    "sulphides": "sulfides",
+    "sulphur": "sulfur",
+    "sulphurous": "sulfurous",
+    "summarise": "summarize",
+    "summarised": "summarized",
+    "summarises": "summarizes",
+    "summarising": "summarizing",
+    "swivelled": "swiveled",
+    "swivelling": "swiveling",
+    "symbolise": "symbolize",
+    "symbolised": "symbolized",
+    "symbolises": "symbolizes",
+    "symbolising": "symbolizing",
+    "sympathise": "sympathize",
+    "sympathised": "sympathized",
+    "sympathiser": "sympathizer",
+    "sympathisers": "sympathizers",
+    "sympathises": "sympathizes",
+    "sympathising": "sympathizing",
+    "synchronisation": "synchronization",
+    "synchronise": "synchronize",
+    "synchronised": "synchronized",
+    "synchronises": "synchronizes",
+    "synchronising": "synchronizing",
+    "synthesise": "synthesize",
+    "synthesised": "synthesized",
+    "synthesiser": "synthesizer",
+    "synthesisers": "synthesizers",
+    "synthesises": "synthesizes",
+    "synthesising": "synthesizing",
+    "syphon": "siphon",
+    "syphoned": "siphoned",
+    "syphoning": "siphoning",
+    "syphons": "siphons",
+    "systematisation": "systematization",
+    "systematise": "systematize",
+    "systematised": "systematized",
+    "systematises": "systematizes",
+    "systematising": "systematizing",
+    "tantalise": "tantalize",
+    "tantalised": "tantalized",
+    "tantalises": "tantalizes",
+    "tantalising": "tantalizing",
+    "tantalisingly": "tantalizingly",
+    "tasselled": "tasseled",
+    "technicolour": "technicolor",
+    "temporise": "temporize",
+    "temporised": "temporized",
+    "temporises": "temporizes",
+    "temporising": "temporizing",
+    "tenderise": "tenderize",
+    "tenderised": "tenderized",
+    "tenderises": "tenderizes",
+    "tenderising": "tenderizing",
+    "terrorise": "terrorize",
+    "terrorised": "terrorized",
+    "terrorises": "terrorizes",
+    "terrorising": "terrorizing",
+    "theatre": "theater",
+    "theatregoer": "theatergoer",
+    "theatregoers": "theatergoers",
+    "theatres": "theaters",
+    "theorise": "theorize",
+    "theorised": "theorized",
+    "theorises": "theorizes",
+    "theorising": "theorizing",
+    "tonne": "ton",
+    "tonnes": "tons",
+    "towelled": "toweled",
+    "towelling": "toweling",
+    "toxaemia": "toxemia",
+    "tranquillise": "tranquilize",
+    "tranquillised": "tranquilized",
+    "tranquilliser": "tranquilizer",
+    "tranquillisers": "tranquilizers",
+    "tranquillises": "tranquilizes",
+    "tranquillising": "tranquilizing",
+    "tranquillity": "tranquility",
+    "tranquillize": "tranquilize",
+    "tranquillized": "tranquilized",
+    "tranquillizer": "tranquilizer",
+    "tranquillizers": "tranquilizers",
+    "tranquillizes": "tranquilizes",
+    "tranquillizing": "tranquilizing",
+    "tranquilly": "tranquility",
+    "transistorised": "transistorized",
+    "traumatise": "traumatize",
+    "traumatised": "traumatized",
+    "traumatises": "traumatizes",
+    "traumatising": "traumatizing",
+    "travelled": "traveled",
+    "traveller": "traveler",
+    "travellers": "travelers",
+    "travelling": "traveling",
+    "travelog": "travelogue",
+    "travelogs": "travelogues",
+    "trialled": "trialed",
+    "trialling": "trialing",
+    "tricolour": "tricolor",
+    "tricolours": "tricolors",
+    "trivialise": "trivialize",
+    "trivialised": "trivialized",
+    "trivialises": "trivializes",
+    "trivialising": "trivializing",
+    "tumour": "tumor",
+    "tumours": "tumors",
+    "tunnelled": "tunneled",
+    "tunnelling": "tunneling",
+    "tyrannise": "tyrannize",
+    "tyrannised": "tyrannized",
+    "tyrannises": "tyrannizes",
+    "tyrannising": "tyrannizing",
+    "tyre": "tire",
+    "tyres": "tires",
+    "unauthorised": "unauthorized",
+    "uncivilised": "uncivilized",
+    "underutilised": "underutilized",
+    "unequalled": "unequaled",
+    "unfavourable": "unfavorable",
+    "unfavourably": "unfavorably",
+    "unionisation": "unionization",
+    "unionise": "unionize",
+    "unionised": "unionized",
+    "unionises": "unionizes",
+    "unionising": "unionizing",
+    "unorganised": "unorganized",
+    "unravelled": "unraveled",
+    "unravelling": "unraveling",
+    "unrecognisable": "unrecognizable",
+    "unrecognised": "unrecognized",
+    "unrivalled": "unrivaled",
+    "unsavoury": "unsavory",
+    "untrammelled": "untrammeled",
+    "urbanisation": "urbanization",
+    "urbanise": "urbanize",
+    "urbanised": "urbanized",
+    "urbanises": "urbanizes",
+    "urbanising": "urbanizing",
+    "utilisable": "utilizable",
+    "utilisation": "utilization",
+    "utilise": "utilize",
+    "utilised": "utilized",
+    "utilises": "utilizes",
+    "utilising": "utilizing",
+    "valour": "valor",
+    "vandalise": "vandalize",
+    "vandalised": "vandalized",
+    "vandalises": "vandalizes",
+    "vandalising": "vandalizing",
+    "vaporisation": "vaporization",
+    "vaporise": "vaporize",
+    "vaporised": "vaporized",
+    "vaporises": "vaporizes",
+    "vaporising": "vaporizing",
+    "vapour": "vapor",
+    "vapours": "vapors",
+    "verbalise": "verbalize",
+    "verbalised": "verbalized",
+    "verbalises": "verbalizes",
+    "verbalising": "verbalizing",
+    "victimisation": "victimization",
+    "victimise": "victimize",
+    "victimised": "victimized",
+    "victimises": "victimizes",
+    "victimising": "victimizing",
+    "videodisc": "videodisk",
+    "videodiscs": "videodisks",
+    "vigour": "vigor",
+    "visualisation": "visualization",
+    "visualisations": "visualizations",
+    "visualise": "visualize",
+    "visualised": "visualized",
+    "visualises": "visualizes",
+    "visualising": "visualizing",
+    "vocalisation": "vocalization",
+    "vocalisations": "vocalizations",
+    "vocalise": "vocalize",
+    "vocalised": "vocalized",
+    "vocalises": "vocalizes",
+    "vocalising": "vocalizing",
+    "vulcanised": "vulcanized",
+    "vulgarisation": "vulgarization",
+    "vulgarise": "vulgarize",
+    "vulgarised": "vulgarized",
+    "vulgarises": "vulgarizes",
+    "vulgarising": "vulgarizing",
+    "waggon": "wagon",
+    "waggons": "wagons",
+    "watercolour": "watercolor",
+    "watercolours": "watercolors",
+    "weaselled": "weaseled",
+    "weaselling": "weaseling",
+    "westernisation": "westernization",
+    "westernise": "westernize",
+    "westernised": "westernized",
+    "westernises": "westernizes",
+    "westernising": "westernizing",
+    "womanise": "womanize",
+    "womanised": "womanized",
+    "womaniser": "womanizer",
+    "womanisers": "womanizers",
+    "womanises": "womanizes",
+    "womanising": "womanizing",
+    "woollen": "woolen",
+    "woollens": "woolens",
+    "woollies": "woolies",
+    "woolly": "wooly",
+    "worshipped": "worshiped",
+    "worshipping": "worshiping",
+    "worshipper": "worshiper",
+    "yodelled": "yodeled",
+    "yodelling": "yodeling",
+    "yoghourt": "yogurt",
+    "yoghourts": "yogurts",
+    "yoghurt": "yogurt",
+    "yoghurts": "yogurts",
+    "mhm": "hmm",
+    "mm": "hmm",
+    "mmm": "hmm",
+}
+
+
 class EnglishSpellingNormalizer:
     """
     Applies British-American spelling mappings as listed in [1].
@@ -510,8 +2252,7 @@ class EnglishSpellingNormalizer:
     """
 
     def __init__(self):
-        mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
-        self.mapping = json.load(open(mapping_path))
+        self.mapping = ENGLISH_MAPPING
 
     def __call__(self, s: str):
         return " ".join(self.mapping.get(word, word) for word in s.split())

From ede09a4c3d27ccd0e894b7b90b2fc775e2f6ba7d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 15:32:13 +0000
Subject: [PATCH 147/156] add get decoder prompt id

---
 .../models/whisper/processing_whisper.py      | 42 ++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 786f2dcf134fb..1536e3ab88663 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -41,12 +41,46 @@ def __init__(self, feature_extractor, tokenizer):
         self.current_processor = self.feature_extractor
         self._in_target_context_manager = False
 
-    def __call__(self, *args, **kwargs):
+    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True, **kwargs):
+        forced_decoder_tokens = "<|startoftranscript|>"
+        if language is not None:
+            if language not in self.tokenizer.additional_special_tokens:
+                raise ValueError(
+                    f"{language} is not supported. The language should be one of the following: '<|en|>',"
+                    " '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>',"
+                    " '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>',"
+                    " '<|vi|>', '<|iw|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>',"
+                    " '<|ta|>', '<|no|>', '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>',"
+                    " '<|ml|>', '<|cy|>', '<|sk|>', '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>',"
+                    " '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>', '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>',"
+                    " '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>', '<|gl|>', '<|mr|>', '<|pa|>', '<|si|>',"
+                    " '<|km|>', '<|sn|>', '<|yo|>', '<|so|>', '<|af|>', '<|oc|>', '<|ka|>', '<|be|>', '<|tg|>',"
+                    " '<|sd|>', '<|gu|>', '<|am|>', '<|yi|>', '<|lo|>', '<|uz|>', '<|fo|>', '<|ht|>', '<|ps|>',"
+                    " '<|tk|>', '<|nn|>', '<|mt|>', '<|sa|>', '<|lb|>', '<|my|>', '<|bo|>', '<|tl|>', '<|mg|>',"
+                    " '<|as|>', '<|tt|>', '<|haw|>', '<|ln|>', '<|ha|>', '<|ba|>', '<|jw|>', '<|su|>'"
+                )
+            forced_decoder_tokens += f"<|{language}|>"
+
+        if task is not None:
+            if task not in self.tokenizer.additional_special_tokens:
+                raise ValueError(f"{task} is not supported. The language should be in : \{'transcribe', 'translate'}")
+            forced_decoder_tokens += f"<|{task}|>"
+
+        forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else None
+        forced_decoder_ids = self.tokenizer.encode(forced_decoder_tokens, **kwargs)
+        return forced_decoder_ids
+
+    def __call__(self, *args, language=None, task=None, no_timestamps=True, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to WhisperFeatureExtractor's
         [`~WhisperFeatureExtractor.__call__`] and returns its output. If used in the context
         [`~WhisperProcessor.as_target_processor`] this method forwards all its arguments to WhisperTokenizer's
         [`~WhisperTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
+
+        Args: 
+            - language 
+            - task 
+            - no_timestamp
         """
         # For backward compatibility
         if self._in_target_context_manager:
@@ -66,12 +100,18 @@ def __call__(self, *args, **kwargs):
         if text is not None:
             encodings = self.tokenizer(text, **kwargs)
 
+        forced_decoder_ids = self._get_decoder_prompt_ids(task, language, no_timestamps, **kwargs)
+
         if text is None:
+            if forced_decoder_ids is not None:
+                return inputs, forced_decoder_ids
             return inputs
+
         elif audio is None:
             return encodings
         else:
             inputs["labels"] = encodings["input_ids"]
+            inputs["forced_decoder_ids"] = forced_decoder_ids
             return inputs
 
     def batch_decode(self, *args, **kwargs):

From 529746a7b741c7e4da4bc37538e80b27029e3ce4 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 20:27:04 +0000
Subject: [PATCH 148/156] revert changes and add forced logit processor

---
 src/transformers/generation_logits_process.py | 15 +++++++++
 src/transformers/generation_utils.py          | 11 ++++++-
 .../models/whisper/processing_whisper.py      | 31 +++++++------------
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 9bbd94bb4e972..8a4402a589c98 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -732,3 +732,18 @@ def __init__(self, suppress_tokens):
     def __call__(self, input_ids, scores):
         scores[:, self.suppress_tokens] = -np.inf
         return scores
+
+
+class ForceTokensLogitsProcessor(LogitsProcessor):
+    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
+    are not sampled."""
+
+    def __init__(self, force_token_map):
+        self.force_token_map = dict(force_token_map)
+
+    def __call__(self, input_ids, scores):
+        generation_idx = input_ids.shape[-1]
+        current_token = self.force_token_map.get(generation_idx, None)
+        if current_token is not None:
+            scores[:, current_token] = np.inf
+        return scores
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index d9f3041121828..1524925481480 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -30,6 +30,7 @@
     ExponentialDecayLengthPenalty,
     ForcedBOSTokenLogitsProcessor,
     ForcedEOSTokenLogitsProcessor,
+    ForceTokensLogitsProcessor,
     HammingDiversityLogitsProcessor,
     InfNanRemoveLogitsProcessor,
     LogitNormalization,
@@ -695,6 +696,7 @@ def _get_logits_processor(
         renormalize_logits: Optional[bool],
         suppress_tokens: Optional[List[int]] = None,
         begin_suppress_tokens: Optional[List[int]] = None,
+        forced_decoder_ids: Optional[List[int]] = None,
     ) -> LogitsProcessorList:
         """
         This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
@@ -733,6 +735,8 @@ def _get_logits_processor(
         begin_suppress_tokens = (
             begin_suppress_tokens if begin_suppress_tokens is not None else self.config.begin_suppress_tokens
         )
+        if forced_decoder_ids is None and hasattr(self.config, "forced_decoder_ids"):
+            forced_decoder_ids = self.config.forced_decoder_ids
         # instantiate processors list
 
         # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
@@ -776,7 +780,8 @@ def _get_logits_processor(
             begin_index = input_ids_seq_length
             begin_index = begin_index if (input_ids_seq_length > 1 or forced_bos_token_id is None) else begin_index + 1
             processors.append(SuppressTokensAtBeginLogitsProcessor(begin_suppress_tokens, begin_index))
-
+        if forced_decoder_ids is not None:
+            processors.append(ForceTokensLogitsProcessor(forced_decoder_ids))
         processors = self._merge_criteria_processor_list(processors, logits_processor)
         # `LogitNormalization` should always be the last logit processor, when present
         if renormalize_logits is True:
@@ -949,6 +954,7 @@ def generate(
         exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
         suppress_tokens: Optional[List[int]] = None,
         begin_suppress_tokens: Optional[List[int]] = None,
+        forced_decoder_ids: Optional[List[int]] = None,
         **model_kwargs,
     ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
         r"""
@@ -1113,6 +1119,8 @@ def generate(
             begin_suppress_tokens  (`List[int]`, *optional*, defaults to `model.config.begin_suppress_tokens`):
                 A list of tokens that will be supressed at the begining of the generation. The `SupressBeginTokens`
                 logit processor will set their log probs to `-inf` so that they are not sampled.
+            forced_decoder_ids (`List[int]`, *optional*, defaults to `model.config.forced_decoder_ids`):
+                A list of tokens that will be forced as beginning tokens.
 
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
@@ -1362,6 +1370,7 @@ def generate(
             renormalize_logits=renormalize_logits,
             suppress_tokens=suppress_tokens,
             begin_suppress_tokens=begin_suppress_tokens,
+            forced_decoder_ids=forced_decoder_ids,
         )
 
         # 8. prepare stopping criteria
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 1536e3ab88663..c74372ca9d1bd 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -41,10 +41,16 @@ def __init__(self, feature_extractor, tokenizer):
         self.current_processor = self.feature_extractor
         self._in_target_context_manager = False
 
-    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True, **kwargs):
-        forced_decoder_tokens = "<|startoftranscript|>"
+    def _get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
+        forced_decoder_tokens = ""
+        
+        if task is not None:
+            if f"<|{task}|>" not in self.tokenizer.additional_special_tokens:
+                raise ValueError(f"'{task}' is not supported. The language should be in : {{'transcribe', 'translate'}}")
+            forced_decoder_tokens += f"<|{task}|>"
+            
         if language is not None:
-            if language not in self.tokenizer.additional_special_tokens:
+            if f"<|{language}|>" not in self.tokenizer.additional_special_tokens:
                 raise ValueError(
                     f"{language} is not supported. The language should be one of the following: '<|en|>',"
                     " '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>',"
@@ -61,26 +67,18 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True, *
                 )
             forced_decoder_tokens += f"<|{language}|>"
 
-        if task is not None:
-            if task not in self.tokenizer.additional_special_tokens:
-                raise ValueError(f"{task} is not supported. The language should be in : \{'transcribe', 'translate'}")
-            forced_decoder_tokens += f"<|{task}|>"
-
         forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else None
-        forced_decoder_ids = self.tokenizer.encode(forced_decoder_tokens, **kwargs)
+        ids = self.tokenizer.encode(forced_decoder_tokens)
+        forced_decoder_ids = [ (rank +1 ,token) for rank,token in enumerate(ids)]
         return forced_decoder_ids
 
-    def __call__(self, *args, language=None, task=None, no_timestamps=True, **kwargs):
+    def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to WhisperFeatureExtractor's
         [`~WhisperFeatureExtractor.__call__`] and returns its output. If used in the context
         [`~WhisperProcessor.as_target_processor`] this method forwards all its arguments to WhisperTokenizer's
         [`~WhisperTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
 
-        Args: 
-            - language 
-            - task 
-            - no_timestamp
         """
         # For backward compatibility
         if self._in_target_context_manager:
@@ -100,18 +98,13 @@ def __call__(self, *args, language=None, task=None, no_timestamps=True, **kwargs
         if text is not None:
             encodings = self.tokenizer(text, **kwargs)
 
-        forced_decoder_ids = self._get_decoder_prompt_ids(task, language, no_timestamps, **kwargs)
-
         if text is None:
-            if forced_decoder_ids is not None:
-                return inputs, forced_decoder_ids
             return inputs
 
         elif audio is None:
             return encodings
         else:
             inputs["labels"] = encodings["input_ids"]
-            inputs["forced_decoder_ids"] = forced_decoder_ids
             return inputs
 
     def batch_decode(self, *args, **kwargs):

From d403c9dc22f43a56049e51521fc5c7a5b7ebff9d Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 20:32:24 +0000
Subject: [PATCH 149/156] nit

---
 src/transformers/models/whisper/processing_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index c74372ca9d1bd..ed8aada73bda9 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -67,7 +67,7 @@ def _get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
                 )
             forced_decoder_tokens += f"<|{language}|>"
 
-        forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else None
+        forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else ""
         ids = self.tokenizer.encode(forced_decoder_tokens)
         forced_decoder_ids = [ (rank +1 ,token) for rank,token in enumerate(ids)]
         return forced_decoder_ids

From b82fe098e4d46b2efe20d84ba1e7d81058d51cd2 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 21:24:08 +0000
Subject: [PATCH 150/156] clean normalizer

---
 .../models/whisper/english_normalizer.py      | 1752 +----------------
 .../models/whisper/processing_whisper.py      |   10 +-
 .../models/whisper/tokenization_whisper.py    |   17 +-
 3 files changed, 25 insertions(+), 1754 deletions(-)

diff --git a/src/transformers/models/whisper/english_normalizer.py b/src/transformers/models/whisper/english_normalizer.py
index 08cc247817584..11912bcc55b72 100644
--- a/src/transformers/models/whisper/english_normalizer.py
+++ b/src/transformers/models/whisper/english_normalizer.py
@@ -500,1750 +500,6 @@ def __call__(self, s: str):
         return s
 
 
-ENGLISH_MAPPING = {
-    "accessorise": "accessorize",
-    "accessorised": "accessorized",
-    "accessorises": "accessorizes",
-    "accessorising": "accessorizing",
-    "acclimatisation": "acclimatization",
-    "acclimatise": "acclimatize",
-    "acclimatised": "acclimatized",
-    "acclimatises": "acclimatizes",
-    "acclimatising": "acclimatizing",
-    "accoutrements": "accouterments",
-    "aeon": "eon",
-    "aeons": "eons",
-    "aerogramme": "aerogram",
-    "aerogrammes": "aerograms",
-    "aeroplane": "airplane",
-    "aeroplanes": "airplanes",
-    "aesthete": "esthete",
-    "aesthetes": "esthetes",
-    "aesthetic": "esthetic",
-    "aesthetically": "esthetically",
-    "aesthetics": "esthetics",
-    "aetiology": "etiology",
-    "ageing": "aging",
-    "aggrandisement": "aggrandizement",
-    "agonise": "agonize",
-    "agonised": "agonized",
-    "agonises": "agonizes",
-    "agonising": "agonizing",
-    "agonisingly": "agonizingly",
-    "almanack": "almanac",
-    "almanacks": "almanacs",
-    "aluminium": "aluminum",
-    "amortisable": "amortizable",
-    "amortisation": "amortization",
-    "amortisations": "amortizations",
-    "amortise": "amortize",
-    "amortised": "amortized",
-    "amortises": "amortizes",
-    "amortising": "amortizing",
-    "amphitheatre": "amphitheater",
-    "amphitheatres": "amphitheaters",
-    "anaemia": "anemia",
-    "anaemic": "anemic",
-    "anaesthesia": "anesthesia",
-    "anaesthetic": "anesthetic",
-    "anaesthetics": "anesthetics",
-    "anaesthetise": "anesthetize",
-    "anaesthetised": "anesthetized",
-    "anaesthetises": "anesthetizes",
-    "anaesthetising": "anesthetizing",
-    "anaesthetist": "anesthetist",
-    "anaesthetists": "anesthetists",
-    "anaesthetize": "anesthetize",
-    "anaesthetized": "anesthetized",
-    "anaesthetizes": "anesthetizes",
-    "anaesthetizing": "anesthetizing",
-    "analogue": "analog",
-    "analogues": "analogs",
-    "analyse": "analyze",
-    "analysed": "analyzed",
-    "analyses": "analyzes",
-    "analysing": "analyzing",
-    "anglicise": "anglicize",
-    "anglicised": "anglicized",
-    "anglicises": "anglicizes",
-    "anglicising": "anglicizing",
-    "annualised": "annualized",
-    "antagonise": "antagonize",
-    "antagonised": "antagonized",
-    "antagonises": "antagonizes",
-    "antagonising": "antagonizing",
-    "apologise": "apologize",
-    "apologised": "apologized",
-    "apologises": "apologizes",
-    "apologising": "apologizing",
-    "appal": "appall",
-    "appals": "appalls",
-    "appetiser": "appetizer",
-    "appetisers": "appetizers",
-    "appetising": "appetizing",
-    "appetisingly": "appetizingly",
-    "arbour": "arbor",
-    "arbours": "arbors",
-    "archeological": "archaeological",
-    "archaeologically": "archeologically",
-    "archaeologist": "archeologist",
-    "archaeologists": "archeologists",
-    "archaeology": "archeology</span>",
-    "ardour": "ardor",
-    "armour": "armor",
-    "armoured": "armored",
-    "armourer": "armorer",
-    "armourers": "armorers",
-    "armouries": "armories",
-    "armoury": "armory",
-    "artefact": "artifact",
-    "artefacts": "artifacts",
-    "authorise": "authorize",
-    "authorised": "authorized",
-    "authorises": "authorizes",
-    "authorising": "authorizing",
-    "axe": "ax",
-    "backpedalled": "backpedaled",
-    "backpedalling": "backpedaling",
-    "bannister": "banister",
-    "bannisters": "banisters",
-    "baptise": "baptize",
-    "baptised": "baptized",
-    "baptises": "baptizes",
-    "baptising": "baptizing",
-    "bastardise": "bastardize",
-    "bastardised": "bastardized",
-    "bastardises": "bastardizes",
-    "bastardising": "bastardizing",
-    "battleax": "battleaxe",
-    "baulk": "balk",
-    "baulked": "balked",
-    "baulking": "balking",
-    "baulks": "balks",
-    "bedevilled": "bedeviled",
-    "bedevilling": "bedeviling",
-    "behaviour": "behavior",
-    "behavioural": "behavioral",
-    "behaviourism": "behaviorism",
-    "behaviourist": "behaviorist",
-    "behaviourists": "behaviorists",
-    "behaviours": "behaviors",
-    "behove": "behoove",
-    "behoved": "behooved",
-    "behoves": "behooves",
-    "bejewelled": "bejeweled",
-    "belabour": "belabor",
-    "belaboured": "belabored",
-    "belabouring": "belaboring",
-    "belabours": "belabors",
-    "bevelled": "beveled",
-    "bevvies": "bevies",
-    "bevvy": "bevy",
-    "biassed": "biased",
-    "biassing": "biasing",
-    "bingeing": "binging",
-    "bougainvillaea": "bougainvillea",
-    "bougainvillaeas": "bougainvilleas",
-    "bowdlerise": "bowdlerize",
-    "bowdlerised": "bowdlerized",
-    "bowdlerises": "bowdlerizes",
-    "bowdlerising": "bowdlerizing",
-    "breathalyse": "breathalyze",
-    "breathalysed": "breathalyzed",
-    "breathalyser": "breathalyzer",
-    "breathalysers": "breathalyzers",
-    "breathalyses": "breathalyzes",
-    "breathalysing": "breathalyzing",
-    "brutalise": "brutalize",
-    "brutalised": "brutalized",
-    "brutalises": "brutalizes",
-    "brutalising": "brutalizing",
-    "busses": "buses",
-    "bussing": "busing",
-    "caesarean": "cesarean",
-    "caesareans": "cesareans",
-    "calibre": "caliber",
-    "calibres": "calibers",
-    "calliper": "caliper",
-    "callipers": "calipers",
-    "callisthenics": "calisthenics",
-    "canalise": "canalize",
-    "canalised": "canalized",
-    "canalises": "canalizes",
-    "canalising": "canalizing",
-    "cancelation": "cancellation",
-    "cancelations": "cancellations",
-    "cancelled": "canceled",
-    "cancelling": "canceling",
-    "candour": "candor",
-    "cannibalise": "cannibalize",
-    "cannibalised": "cannibalized",
-    "cannibalises": "cannibalizes",
-    "cannibalising": "cannibalizing",
-    "canonise": "canonize",
-    "canonised": "canonized",
-    "canonises": "canonizes",
-    "canonising": "canonizing",
-    "capitalise": "capitalize",
-    "capitalised": "capitalized",
-    "capitalises": "capitalizes",
-    "capitalising": "capitalizing",
-    "caramelise": "caramelize",
-    "caramelised": "caramelized",
-    "caramelises": "caramelizes",
-    "caramelising": "caramelizing",
-    "carbonise": "carbonize",
-    "carbonised": "carbonized",
-    "carbonises": "carbonizes",
-    "carbonising": "carbonizing",
-    "carolled": "caroled",
-    "carolling": "caroling",
-    "catalogue": "catalog",
-    "catalogued": "cataloged",
-    "catalogues": "catalogs",
-    "cataloguing": "cataloging",
-    "catalyse": "catalyze",
-    "catalysed": "catalyzed",
-    "catalyses": "catalyzes",
-    "catalysing": "catalyzing",
-    "categorise": "categorize",
-    "categorised": "categorized",
-    "categorises": "categorizes",
-    "categorising": "categorizing",
-    "cauterise": "cauterize",
-    "cauterised": "cauterized",
-    "cauterises": "cauterizes",
-    "cauterising": "cauterizing",
-    "cavilled": "caviled",
-    "cavilling": "caviling",
-    "centigramme": "centigram",
-    "centigrammes": "centigrams",
-    "centilitre": "centiliter",
-    "centilitres": "centiliters",
-    "centimetre": "centimeter",
-    "centimetres": "centimeters",
-    "centralise": "centralize",
-    "centralised": "centralized",
-    "centralises": "centralizes",
-    "centralising": "centralizing",
-    "centre": "center",
-    "centred": "centered",
-    "centrefold": "centerfold",
-    "centrefolds": "centerfolds",
-    "centrepiece": "centerpiece",
-    "centrepieces": "centerpieces",
-    "centres": "centers",
-    "channelled": "channeled",
-    "channelling": "channeling",
-    "characterise": "characterize",
-    "characterised": "characterized",
-    "characterises": "characterizes",
-    "characterising": "characterizing",
-    "cheque": "check",
-    "chequebook": "checkbook",
-    "chequebooks": "checkbooks",
-    "chequered": "checkered",
-    "cheques": "checks",
-    "chilli": "chili",
-    "chimaera": "chimera",
-    "chimaeras": "chimeras",
-    "chiselled": "chiseled",
-    "chiselling": "chiseling",
-    "circularise": "circularize",
-    "circularised": "circularized",
-    "circularises": "circularizes",
-    "circularising": "circularizing",
-    "civilise": "civilize",
-    "civilised": "civilized",
-    "civilises": "civilizes",
-    "civilising": "civilizing",
-    "clamour": "clamor",
-    "clamoured": "clamored",
-    "clamouring": "clamoring",
-    "clamours": "clamors",
-    "clangour": "clangor",
-    "clarinettist": "clarinetist",
-    "clarinettists": "clarinetists",
-    "collectivise": "collectivize",
-    "collectivised": "collectivized",
-    "collectivises": "collectivizes",
-    "collectivising": "collectivizing",
-    "colonisation": "colonization",
-    "colonise": "colonize",
-    "colonised": "colonized",
-    "coloniser": "colonizer",
-    "colonisers": "colonizers",
-    "colonises": "colonizes",
-    "colonising": "colonizing",
-    "colour": "color",
-    "colourant": "colorant",
-    "colourants": "colorants",
-    "coloured": "colored",
-    "coloureds": "coloreds",
-    "colourful": "colorful",
-    "colourfully": "colorfully",
-    "colouring": "coloring",
-    "colourize": "colorize",
-    "colourized": "colorized",
-    "colourizes": "colorizes",
-    "colourizing": "colorizing",
-    "colourless": "colorless",
-    "colours": "colors",
-    "commercialise": "commercialize",
-    "commercialised": "commercialized",
-    "commercialises": "commercializes",
-    "commercialising": "commercializing",
-    "compartmentalise": "compartmentalize",
-    "compartmentalised": "compartmentalized",
-    "compartmentalises": "compartmentalizes",
-    "compartmentalising": "compartmentalizing",
-    "computerise": "computerize",
-    "computerised": "computerized",
-    "computerises": "computerizes",
-    "computerising": "computerizing",
-    "conceptualise": "conceptualize",
-    "conceptualised": "conceptualized",
-    "conceptualises": "conceptualizes",
-    "conceptualising": "conceptualizing",
-    "connexion": "connection",
-    "connexions": "connections",
-    "contextualise": "contextualize",
-    "contextualised": "contextualized",
-    "contextualises": "contextualizes",
-    "contextualising": "contextualizing",
-    "cosier": "cozier",
-    "cosies": "cozies",
-    "cosiest": "coziest",
-    "cosily": "cozily",
-    "cosiness": "coziness",
-    "cosy": "cozy",
-    "councillor": "councilor",
-    "councillors": "councilors",
-    "counselled": "counseled",
-    "counselling": "counseling",
-    "counsellor": "counselor",
-    "counsellors": "counselors",
-    "crenelated": "crenellated",
-    "criminalise": "criminalize",
-    "criminalised": "criminalized",
-    "criminalises": "criminalizes",
-    "criminalising": "criminalizing",
-    "criticise": "criticize",
-    "criticised": "criticized",
-    "criticises": "criticizes",
-    "criticising": "criticizing",
-    "crueller": "crueler",
-    "cruellest": "cruelest",
-    "crystallisation": "crystallization",
-    "crystallise": "crystallize",
-    "crystallised": "crystallized",
-    "crystallises": "crystallizes",
-    "crystallising": "crystallizing",
-    "cudgelled": "cudgeled",
-    "cudgelling": "cudgeling",
-    "customise": "customize",
-    "customised": "customized",
-    "customises": "customizes",
-    "customising": "customizing",
-    "cypher": "cipher",
-    "cyphers": "ciphers",
-    "decentralisation": "decentralization",
-    "decentralise": "decentralize",
-    "decentralised": "decentralized",
-    "decentralises": "decentralizes",
-    "decentralising": "decentralizing",
-    "decriminalisation": "decriminalization",
-    "decriminalise": "decriminalize",
-    "decriminalised": "decriminalized",
-    "decriminalises": "decriminalizes",
-    "decriminalising": "decriminalizing",
-    "defence": "defense",
-    "defenceless": "defenseless",
-    "defences": "defenses",
-    "dehumanisation": "dehumanization",
-    "dehumanise": "dehumanize",
-    "dehumanised": "dehumanized",
-    "dehumanises": "dehumanizes",
-    "dehumanising": "dehumanizing",
-    "demeanour": "demeanor",
-    "demilitarisation": "demilitarization",
-    "demilitarise": "demilitarize",
-    "demilitarised": "demilitarized",
-    "demilitarises": "demilitarizes",
-    "demilitarising": "demilitarizing",
-    "demobilisation": "demobilization",
-    "demobilise": "demobilize",
-    "demobilised": "demobilized",
-    "demobilises": "demobilizes",
-    "demobilising": "demobilizing",
-    "democratisation": "democratization",
-    "democratise": "democratize",
-    "democratised": "democratized",
-    "democratises": "democratizes",
-    "democratising": "democratizing",
-    "demonise": "demonize",
-    "demonised": "demonized",
-    "demonises": "demonizes",
-    "demonising": "demonizing",
-    "demoralisation": "demoralization",
-    "demoralise": "demoralize",
-    "demoralised": "demoralized",
-    "demoralises": "demoralizes",
-    "demoralising": "demoralizing",
-    "denationalisation": "denationalization",
-    "denationalise": "denationalize",
-    "denationalised": "denationalized",
-    "denationalises": "denationalizes",
-    "denationalising": "denationalizing",
-    "deodorise": "deodorize",
-    "deodorised": "deodorized",
-    "deodorises": "deodorizes",
-    "deodorising": "deodorizing",
-    "depersonalise": "depersonalize",
-    "depersonalised": "depersonalized",
-    "depersonalises": "depersonalizes",
-    "depersonalising": "depersonalizing",
-    "deputise": "deputize",
-    "deputised": "deputized",
-    "deputises": "deputizes",
-    "deputising": "deputizing",
-    "desensitisation": "desensitization",
-    "desensitise": "desensitize",
-    "desensitised": "desensitized",
-    "desensitises": "desensitizes",
-    "desensitising": "desensitizing",
-    "destabilisation": "destabilization",
-    "destabilise": "destabilize",
-    "destabilised": "destabilized",
-    "destabilises": "destabilizes",
-    "destabilising": "destabilizing",
-    "dialled": "dialed",
-    "dialling": "dialing",
-    "dialogue": "dialog",
-    "dialogues": "dialogs",
-    "diarrhoea": "diarrhea",
-    "digitise": "digitize",
-    "digitised": "digitized",
-    "digitises": "digitizes",
-    "digitising": "digitizing",
-    "disc": "disk",
-    "discolour": "discolor",
-    "discoloured": "discolored",
-    "discolouring": "discoloring",
-    "discolours": "discolors",
-    "discs": "disks",
-    "disembowelled": "disemboweled",
-    "disembowelling": "disemboweling",
-    "disfavour": "disfavor",
-    "dishevelled": "disheveled",
-    "dishonour": "dishonor",
-    "dishonourable": "dishonorable",
-    "dishonourably": "dishonorably",
-    "dishonoured": "dishonored",
-    "dishonouring": "dishonoring",
-    "dishonours": "dishonors",
-    "disorganisation": "disorganization",
-    "disorganised": "disorganized",
-    "distil": "distill",
-    "distils": "distills",
-    "dramatisation": "dramatization",
-    "dramatisations": "dramatizations",
-    "dramatise": "dramatize",
-    "dramatised": "dramatized",
-    "dramatises": "dramatizes",
-    "dramatising": "dramatizing",
-    "draught": "draft",
-    "draughtboard": "draftboard",
-    "draughtboards": "draftboards",
-    "draughtier": "draftier",
-    "draughtiest": "draftiest",
-    "draughts": "drafts",
-    "draughtsman": "draftsman",
-    "draughtsmanship": "draftsmanship",
-    "draughtsmen": "draftsmen",
-    "draughtswoman": "draftswoman",
-    "draughtswomen": "draftswomen",
-    "draughty": "drafty",
-    "drivelled": "driveled",
-    "drivelling": "driveling",
-    "duelled": "dueled",
-    "duelling": "dueling",
-    "economise": "economize",
-    "economised": "economized",
-    "economises": "economizes",
-    "economising": "economizing",
-    "edoema": "edema",
-    "editorialise": "editorialize",
-    "editorialised": "editorialized",
-    "editorialises": "editorializes",
-    "editorialising": "editorializing",
-    "empathise": "empathize",
-    "empathised": "empathized",
-    "empathises": "empathizes",
-    "empathising": "empathizing",
-    "emphasise": "emphasize",
-    "emphasised": "emphasized",
-    "emphasises": "emphasizes",
-    "emphasising": "emphasizing",
-    "enamelled": "enameled",
-    "enamelling": "enameling",
-    "enamoured": "enamored",
-    "encyclopaedia": "encyclopedia",
-    "encyclopaedias": "encyclopedias",
-    "encyclopaedic": "encyclopedic",
-    "endeavour": "endeavor",
-    "endeavoured": "endeavored",
-    "endeavouring": "endeavoring",
-    "endeavours": "endeavors",
-    "energise": "energize",
-    "energised": "energized",
-    "energises": "energizes",
-    "energising": "energizing",
-    "enrol": "enroll",
-    "enrols": "enrolls",
-    "enthral": "enthrall",
-    "enthrals": "enthralls",
-    "epaulette": "epaulet",
-    "epaulettes": "epaulets",
-    "epicentre": "epicenter",
-    "epicentres": "epicenters",
-    "epilogue": "epilog",
-    "epilogues": "epilogs",
-    "epitomise": "epitomize",
-    "epitomised": "epitomized",
-    "epitomises": "epitomizes",
-    "epitomising": "epitomizing",
-    "equalisation": "equalization",
-    "equalise": "equalize",
-    "equalised": "equalized",
-    "equaliser": "equalizer",
-    "equalisers": "equalizers",
-    "equalises": "equalizes",
-    "equalising": "equalizing",
-    "eulogise": "eulogize",
-    "eulogised": "eulogized",
-    "eulogises": "eulogizes",
-    "eulogising": "eulogizing",
-    "evangelise": "evangelize",
-    "evangelised": "evangelized",
-    "evangelises": "evangelizes",
-    "evangelising": "evangelizing",
-    "exorcise": "exorcize",
-    "exorcised": "exorcized",
-    "exorcises": "exorcizes",
-    "exorcising": "exorcizing",
-    "extemporisation": "extemporization",
-    "extemporise": "extemporize",
-    "extemporised": "extemporized",
-    "extemporises": "extemporizes",
-    "extemporising": "extemporizing",
-    "externalisation": "externalization",
-    "externalisations": "externalizations",
-    "externalise": "externalize",
-    "externalised": "externalized",
-    "externalises": "externalizes",
-    "externalising": "externalizing",
-    "factorise": "factorize",
-    "factorised": "factorized",
-    "factorises": "factorizes",
-    "factorising": "factorizing",
-    "faecal": "fecal",
-    "faeces": "feces",
-    "familiarisation": "familiarization",
-    "familiarise": "familiarize",
-    "familiarised": "familiarized",
-    "familiarises": "familiarizes",
-    "familiarising": "familiarizing",
-    "fantasise": "fantasize",
-    "fantasised": "fantasized",
-    "fantasises": "fantasizes",
-    "fantasising": "fantasizing",
-    "favour": "favor",
-    "favourable": "favorable",
-    "favourably": "favorably",
-    "favoured": "favored",
-    "favouring": "favoring",
-    "favourite": "favorite",
-    "favourites": "favorites",
-    "favouritism": "favoritism",
-    "favours": "favors",
-    "feminise": "feminize",
-    "feminised": "feminized",
-    "feminises": "feminizes",
-    "feminising": "feminizing",
-    "fertilisation": "fertilization",
-    "fertilise": "fertilize",
-    "fertilised": "fertilized",
-    "fertiliser": "fertilizer",
-    "fertilisers": "fertilizers",
-    "fertilises": "fertilizes",
-    "fertilising": "fertilizing",
-    "fervour": "fervor",
-    "fibre": "fiber",
-    "fibreglass": "fiberglass",
-    "fibres": "fibers",
-    "fictionalisation": "fictionalization",
-    "fictionalisations": "fictionalizations",
-    "fictionalise": "fictionalize",
-    "fictionalised": "fictionalized",
-    "fictionalises": "fictionalizes",
-    "fictionalising": "fictionalizing",
-    "fillet": "filet",
-    "filleted": "fileted",
-    "filleting": "fileting",
-    "fillets": "filets",
-    "finalisation": "finalization",
-    "finalise": "finalize",
-    "finalised": "finalized",
-    "finalises": "finalizes",
-    "finalising": "finalizing",
-    "flautist": "flutist",
-    "flautists": "flutists",
-    "flavour": "flavor",
-    "flavoured": "flavored",
-    "flavouring": "flavoring",
-    "flavourings": "flavorings",
-    "flavourless": "flavorless",
-    "flavours": "flavors",
-    "flavoursome": "flavorsome",
-    "flyer / flier": "flier / flyer",
-    "foetal": "fetal",
-    "foetid": "fetid",
-    "foetus": "fetus",
-    "foetuses": "fetuses",
-    "formalisation": "formalization",
-    "formalise": "formalize",
-    "formalised": "formalized",
-    "formalises": "formalizes",
-    "formalising": "formalizing",
-    "fossilisation": "fossilization",
-    "fossilise": "fossilize",
-    "fossilised": "fossilized",
-    "fossilises": "fossilizes",
-    "fossilising": "fossilizing",
-    "fraternisation": "fraternization",
-    "fraternise": "fraternize",
-    "fraternised": "fraternized",
-    "fraternises": "fraternizes",
-    "fraternising": "fraternizing",
-    "fulfil": "fulfill",
-    "fulfilment": "fulfillment",
-    "fulfils": "fulfills",
-    "funnelled": "funneled",
-    "funnelling": "funneling",
-    "galvanise": "galvanize",
-    "galvanised": "galvanized",
-    "galvanises": "galvanizes",
-    "galvanising": "galvanizing",
-    "gambolled": "gamboled",
-    "gambolling": "gamboling",
-    "gaol": "jail",
-    "gaolbird": "jailbird",
-    "gaolbirds": "jailbirds",
-    "gaolbreak": "jailbreak",
-    "gaolbreaks": "jailbreaks",
-    "gaoled": "jailed",
-    "gaoler": "jailer",
-    "gaolers": "jailers",
-    "gaoling": "jailing",
-    "gaols": "jails",
-    "gasses": "gases",
-    "gage": "gauge",
-    "gaged": "gauged",
-    "gages": "gauges",
-    "gaging": "gauging",
-    "generalisation": "generalization",
-    "generalisations": "generalizations",
-    "generalise": "generalize",
-    "generalised": "generalized",
-    "generalises": "generalizes",
-    "generalising": "generalizing",
-    "ghettoise": "ghettoize",
-    "ghettoised": "ghettoized",
-    "ghettoises": "ghettoizes",
-    "ghettoising": "ghettoizing",
-    "gipsies": "gypsies",
-    "glamorise": "glamorize",
-    "glamorised": "glamorized",
-    "glamorises": "glamorizes",
-    "glamorising": "glamorizing",
-    "glamor": "glamour",
-    "globalisation": "globalization",
-    "globalise": "globalize",
-    "globalised": "globalized",
-    "globalises": "globalizes",
-    "globalising": "globalizing",
-    "glueing": "gluing",
-    "goitre": "goiter",
-    "goitres": "goiters",
-    "gonorrhoea": "gonorrhea",
-    "gramme": "gram",
-    "grammes": "grams",
-    "gravelled": "graveled",
-    "grey": "gray",
-    "greyed": "grayed",
-    "greying": "graying",
-    "greyish": "grayish",
-    "greyness": "grayness",
-    "greys": "grays",
-    "grovelled": "groveled",
-    "grovelling": "groveling",
-    "groyne": "groin",
-    "groynes": "groins",
-    "gruelling": "grueling",
-    "gruellingly": "gruelingly",
-    "gryphon": "griffin",
-    "gryphons": "griffins",
-    "gynaecological": "gynecological",
-    "gynaecologist": "gynecologist",
-    "gynaecologists": "gynecologists",
-    "gynaecology": "gynecology",
-    "haematological": "hematological",
-    "haematologist": "hematologist",
-    "haematologists": "hematologists",
-    "haematology": "hematology",
-    "haemoglobin": "hemoglobin",
-    "haemophilia": "hemophilia",
-    "haemophiliac": "hemophiliac",
-    "haemophiliacs": "hemophiliacs",
-    "haemorrhage": "hemorrhage",
-    "haemorrhaged": "hemorrhaged",
-    "haemorrhages": "hemorrhages",
-    "haemorrhaging": "hemorrhaging",
-    "haemorrhoids": "hemorrhoids",
-    "harbour": "harbor",
-    "harboured": "harbored",
-    "harbouring": "harboring",
-    "harbours": "harbors",
-    "harmonisation": "harmonization",
-    "harmonise": "harmonize",
-    "harmonised": "harmonized",
-    "harmonises": "harmonizes",
-    "harmonising": "harmonizing",
-    "homoeopath": "homeopath",
-    "homoeopathic": "homeopathic",
-    "homoeopaths": "homeopaths",
-    "homoeopathy": "homeopathy",
-    "homogenise": "homogenize",
-    "homogenised": "homogenized",
-    "homogenises": "homogenizes",
-    "homogenising": "homogenizing",
-    "honour": "honor",
-    "honourable": "honorable",
-    "honourably": "honorably",
-    "honoured": "honored",
-    "honouring": "honoring",
-    "honours": "honors",
-    "hospitalisation": "hospitalization",
-    "hospitalise": "hospitalize",
-    "hospitalised": "hospitalized",
-    "hospitalises": "hospitalizes",
-    "hospitalising": "hospitalizing",
-    "humanise": "humanize",
-    "humanised": "humanized",
-    "humanises": "humanizes",
-    "humanising": "humanizing",
-    "humour": "humor",
-    "humoured": "humored",
-    "humouring": "humoring",
-    "humourless": "humorless",
-    "humours": "humors",
-    "hybridise": "hybridize",
-    "hybridised": "hybridized",
-    "hybridises": "hybridizes",
-    "hybridising": "hybridizing",
-    "hypnotise": "hypnotize",
-    "hypnotised": "hypnotized",
-    "hypnotises": "hypnotizes",
-    "hypnotising": "hypnotizing",
-    "hypothesise": "hypothesize",
-    "hypothesised": "hypothesized",
-    "hypothesises": "hypothesizes",
-    "hypothesising": "hypothesizing",
-    "idealisation": "idealization",
-    "idealise": "idealize",
-    "idealised": "idealized",
-    "idealises": "idealizes",
-    "idealising": "idealizing",
-    "idolise": "idolize",
-    "idolised": "idolized",
-    "idolises": "idolizes",
-    "idolising": "idolizing",
-    "immobilisation": "immobilization",
-    "immobilise": "immobilize",
-    "immobilised": "immobilized",
-    "immobiliser": "immobilizer",
-    "immobilisers": "immobilizers",
-    "immobilises": "immobilizes",
-    "immobilising": "immobilizing",
-    "immortalise": "immortalize",
-    "immortalised": "immortalized",
-    "immortalises": "immortalizes",
-    "immortalising": "immortalizing",
-    "immunisation": "immunization",
-    "immunise": "immunize",
-    "immunised": "immunized",
-    "immunises": "immunizes",
-    "immunising": "immunizing",
-    "impanelled": "impaneled",
-    "impanelling": "impaneling",
-    "imperilled": "imperiled",
-    "imperilling": "imperiling",
-    "individualise": "individualize",
-    "individualised": "individualized",
-    "individualises": "individualizes",
-    "individualising": "individualizing",
-    "industrialise": "industrialize",
-    "industrialised": "industrialized",
-    "industrialises": "industrializes",
-    "industrialising": "industrializing",
-    "inflexion": "inflection",
-    "inflexions": "inflections",
-    "initialise": "initialize",
-    "initialised": "initialized",
-    "initialises": "initializes",
-    "initialising": "initializing",
-    "initialled": "initialed",
-    "initialling": "initialing",
-    "instal": "install",
-    "instalment": "installment",
-    "instalments": "installments",
-    "instals": "installs",
-    "instil": "instill",
-    "instils": "instills",
-    "institutionalisation": "institutionalization",
-    "institutionalise": "institutionalize",
-    "institutionalised": "institutionalized",
-    "institutionalises": "institutionalizes",
-    "institutionalising": "institutionalizing",
-    "intellectualise": "intellectualize",
-    "intellectualised": "intellectualized",
-    "intellectualises": "intellectualizes",
-    "intellectualising": "intellectualizing",
-    "internalisation": "internalization",
-    "internalise": "internalize",
-    "internalised": "internalized",
-    "internalises": "internalizes",
-    "internalising": "internalizing",
-    "internationalisation": "internationalization",
-    "internationalise": "internationalize",
-    "internationalised": "internationalized",
-    "internationalises": "internationalizes",
-    "internationalising": "internationalizing",
-    "ionisation": "ionization",
-    "ionise": "ionize",
-    "ionised": "ionized",
-    "ioniser": "ionizer",
-    "ionisers": "ionizers",
-    "ionises": "ionizes",
-    "ionising": "ionizing",
-    "italicise": "italicize",
-    "italicised": "italicized",
-    "italicises": "italicizes",
-    "italicising": "italicizing",
-    "itemise": "itemize",
-    "itemised": "itemized",
-    "itemises": "itemizes",
-    "itemising": "itemizing",
-    "jeopardise": "jeopardize",
-    "jeopardised": "jeopardized",
-    "jeopardises": "jeopardizes",
-    "jeopardising": "jeopardizing",
-    "jewelled": "jeweled",
-    "jeweller": "jeweler",
-    "jewellers": "jewelers",
-    "jewellery": "jewelry",
-    "judgement": "judgment",
-    "kilogramme": "kilogram",
-    "kilogrammes": "kilograms",
-    "kilometre": "kilometer",
-    "kilometres": "kilometers",
-    "labelled": "labeled",
-    "labelling": "labeling",
-    "labour": "labor",
-    "laboured": "labored",
-    "labourer": "laborer",
-    "labourers": "laborers",
-    "labouring": "laboring",
-    "labours": "labors",
-    "lacklustre": "lackluster",
-    "legalisation": "legalization",
-    "legalise": "legalize",
-    "legalised": "legalized",
-    "legalises": "legalizes",
-    "legalising": "legalizing",
-    "legitimise": "legitimize",
-    "legitimised": "legitimized",
-    "legitimises": "legitimizes",
-    "legitimising": "legitimizing",
-    "leukaemia": "leukemia",
-    "levelled": "leveled",
-    "leveller": "leveler",
-    "levellers": "levelers",
-    "levelling": "leveling",
-    "libelled": "libeled",
-    "libelling": "libeling",
-    "libellous": "libelous",
-    "liberalisation": "liberalization",
-    "liberalise": "liberalize",
-    "liberalised": "liberalized",
-    "liberalises": "liberalizes",
-    "liberalising": "liberalizing",
-    "licence": "license",
-    "licenced": "licensed",
-    "licences": "licenses",
-    "licencing": "licensing",
-    "likeable": "likable",
-    "lionisation": "lionization",
-    "lionise": "lionize",
-    "lionised": "lionized",
-    "lionises": "lionizes",
-    "lionising": "lionizing",
-    "liquidise": "liquidize",
-    "liquidised": "liquidized",
-    "liquidiser": "liquidizer",
-    "liquidisers": "liquidizers",
-    "liquidises": "liquidizes",
-    "liquidising": "liquidizing",
-    "litre": "liter",
-    "litres": "liters",
-    "localise": "localize",
-    "localised": "localized",
-    "localises": "localizes",
-    "localising": "localizing",
-    "louvre": "louver",
-    "louvred": "louvered",
-    "louvres": "louvers",
-    "lustre": "luster",
-    "magnetise": "magnetize",
-    "magnetised": "magnetized",
-    "magnetises": "magnetizes",
-    "magnetising": "magnetizing",
-    "manoeuvrability": "maneuverability",
-    "manoeuvrable": "maneuverable",
-    "manoeuvre": "maneuver",
-    "manoeuvred": "maneuvered",
-    "manoeuvres": "maneuvers",
-    "manoeuvring": "maneuvering",
-    "manoeuvrings": "maneuverings",
-    "marginalisation": "marginalization",
-    "marginalise": "marginalize",
-    "marginalised": "marginalized",
-    "marginalises": "marginalizes",
-    "marginalising": "marginalizing",
-    "marshalled": "marshaled",
-    "marshalling": "marshaling",
-    "marvelled": "marveled",
-    "marvelling": "marveling",
-    "marvellous": "marvelous",
-    "marvellously": "marvelously",
-    "materialisation": "materialization",
-    "materialise": "materialize",
-    "materialised": "materialized",
-    "materialises": "materializes",
-    "materialising": "materializing",
-    "maximisation": "maximization",
-    "maximise": "maximize",
-    "maximised": "maximized",
-    "maximises": "maximizes",
-    "maximising": "maximizing",
-    "meagre": "meager",
-    "mechanisation": "mechanization",
-    "mechanise": "mechanize",
-    "mechanised": "mechanized",
-    "mechanises": "mechanizes",
-    "mechanising": "mechanizing",
-    "mediaeval": "medieval",
-    "memorialise": "memorialize",
-    "memorialised": "memorialized",
-    "memorialises": "memorializes",
-    "memorialising": "memorializing",
-    "memorise": "memorize",
-    "memorised": "memorized",
-    "memorises": "memorizes",
-    "memorising": "memorizing",
-    "mesmerise": "mesmerize",
-    "mesmerised": "mesmerized",
-    "mesmerises": "mesmerizes",
-    "mesmerising": "mesmerizing",
-    "metabolise": "metabolize",
-    "metabolised": "metabolized",
-    "metabolises": "metabolizes",
-    "metabolising": "metabolizing",
-    "metre": "meter",
-    "metres": "meters",
-    "micrometre": "micrometer",
-    "micrometres": "micrometers",
-    "militarise": "militarize",
-    "militarised": "militarized",
-    "militarises": "militarizes",
-    "militarising": "militarizing",
-    "milligramme": "milligram",
-    "milligrammes": "milligrams",
-    "millilitre": "milliliter",
-    "millilitres": "milliliters",
-    "millimetre": "millimeter",
-    "millimetres": "millimeters",
-    "miniaturisation": "miniaturization",
-    "miniaturise": "miniaturize",
-    "miniaturised": "miniaturized",
-    "miniaturises": "miniaturizes",
-    "miniaturising": "miniaturizing",
-    "minibusses": "minibuses",
-    "minimise": "minimize",
-    "minimised": "minimized",
-    "minimises": "minimizes",
-    "minimising": "minimizing",
-    "misbehaviour": "misbehavior",
-    "misdemeanour": "misdemeanor",
-    "misdemeanours": "misdemeanors",
-    "misspelt": "misspelled",
-    "mitre": "miter",
-    "mitres": "miters",
-    "mobilisation": "mobilization",
-    "mobilise": "mobilize",
-    "mobilised": "mobilized",
-    "mobilises": "mobilizes",
-    "mobilising": "mobilizing",
-    "modelled": "modeled",
-    "modeller": "modeler",
-    "modellers": "modelers",
-    "modelling": "modeling",
-    "modernise": "modernize",
-    "modernised": "modernized",
-    "modernises": "modernizes",
-    "modernising": "modernizing",
-    "moisturise": "moisturize",
-    "moisturised": "moisturized",
-    "moisturiser": "moisturizer",
-    "moisturisers": "moisturizers",
-    "moisturises": "moisturizes",
-    "moisturising": "moisturizing",
-    "monologue": "monolog",
-    "monologues": "monologs",
-    "monopolisation": "monopolization",
-    "monopolise": "monopolize",
-    "monopolised": "monopolized",
-    "monopolises": "monopolizes",
-    "monopolising": "monopolizing",
-    "moralise": "moralize",
-    "moralised": "moralized",
-    "moralises": "moralizes",
-    "moralising": "moralizing",
-    "motorised": "motorized",
-    "mould": "mold",
-    "moulded": "molded",
-    "moulder": "molder",
-    "mouldered": "moldered",
-    "mouldering": "moldering",
-    "moulders": "molders",
-    "mouldier": "moldier",
-    "mouldiest": "moldiest",
-    "moulding": "molding",
-    "mouldings": "moldings",
-    "moulds": "molds",
-    "mouldy": "moldy",
-    "moult": "molt",
-    "moulted": "molted",
-    "moulting": "molting",
-    "moults": "molts",
-    "moustache": "mustache",
-    "moustached": "mustached",
-    "moustaches": "mustaches",
-    "moustachioed": "mustachioed",
-    "multicoloured": "multicolored",
-    "nationalisation": "nationalization",
-    "nationalisations": "nationalizations",
-    "nationalise": "nationalize",
-    "nationalised": "nationalized",
-    "nationalises": "nationalizes",
-    "nationalising": "nationalizing",
-    "naturalisation": "naturalization",
-    "naturalise": "naturalize",
-    "naturalised": "naturalized",
-    "naturalises": "naturalizes",
-    "naturalising": "naturalizing",
-    "neighbour": "neighbor",
-    "neighbourhood": "neighborhood",
-    "neighbourhoods": "neighborhoods",
-    "neighbouring": "neighboring",
-    "neighbourliness": "neighborliness",
-    "neighbourly": "neighborly",
-    "neighbours": "neighbors",
-    "neutralisation": "neutralization",
-    "neutralise": "neutralize",
-    "neutralised": "neutralized",
-    "neutralises": "neutralizes",
-    "neutralising": "neutralizing",
-    "normalisation": "normalization",
-    "normalise": "normalize",
-    "normalised": "normalized",
-    "normalises": "normalizes",
-    "normalising": "normalizing",
-    "odour": "odor",
-    "odourless": "odorless",
-    "odours": "odors",
-    "oesophagus": "esophagus",
-    "oesophaguses": "esophaguses",
-    "oestrogen": "estrogen",
-    "offence": "offense",
-    "offences": "offenses",
-    "omelette": "omelet",
-    "omelettes": "omelets",
-    "optimise": "optimize",
-    "optimised": "optimized",
-    "optimises": "optimizes",
-    "optimising": "optimizing",
-    "organisation": "organization",
-    "organisational": "organizational",
-    "organisations": "organizations",
-    "organise": "organize",
-    "organised": "organized",
-    "organiser": "organizer",
-    "organisers": "organizers",
-    "organises": "organizes",
-    "organising": "organizing",
-    "orthopaedic": "orthopedic",
-    "orthopaedics": "orthopedics",
-    "ostracise": "ostracize",
-    "ostracised": "ostracized",
-    "ostracises": "ostracizes",
-    "ostracising": "ostracizing",
-    "outmanoeuvre": "outmaneuver",
-    "outmanoeuvred": "outmaneuvered",
-    "outmanoeuvres": "outmaneuvers",
-    "outmanoeuvring": "outmaneuvering",
-    "overemphasise": "overemphasize",
-    "overemphasised": "overemphasized",
-    "overemphasises": "overemphasizes",
-    "overemphasising": "overemphasizing",
-    "oxidisation": "oxidization",
-    "oxidise": "oxidize",
-    "oxidised": "oxidized",
-    "oxidises": "oxidizes",
-    "oxidising": "oxidizing",
-    "paederast": "pederast",
-    "paederasts": "pederasts",
-    "paediatric": "pediatric",
-    "paediatrician": "pediatrician",
-    "paediatricians": "pediatricians",
-    "paediatrics": "pediatrics",
-    "paedophile": "pedophile",
-    "paedophiles": "pedophiles",
-    "paedophilia": "pedophilia",
-    "palaeolithic": "paleolithic",
-    "palaeontologist": "paleontologist",
-    "palaeontologists": "paleontologists",
-    "palaeontology": "paleontology",
-    "panelled": "paneled",
-    "panelling": "paneling",
-    "panellist": "panelist",
-    "panellists": "panelists",
-    "paralyse": "paralyze",
-    "paralysed": "paralyzed",
-    "paralyses": "paralyzes",
-    "paralysing": "paralyzing",
-    "parcelled": "parceled",
-    "parcelling": "parceling",
-    "parlour": "parlor",
-    "parlours": "parlors",
-    "particularise": "particularize",
-    "particularised": "particularized",
-    "particularises": "particularizes",
-    "particularising": "particularizing",
-    "passivisation": "passivization",
-    "passivise": "passivize",
-    "passivised": "passivized",
-    "passivises": "passivizes",
-    "passivising": "passivizing",
-    "pasteurisation": "pasteurization",
-    "pasteurise": "pasteurize",
-    "pasteurised": "pasteurized",
-    "pasteurises": "pasteurizes",
-    "pasteurising": "pasteurizing",
-    "patronise": "patronize",
-    "patronised": "patronized",
-    "patronises": "patronizes",
-    "patronising": "patronizing",
-    "patronisingly": "patronizingly",
-    "pedalled": "pedaled",
-    "pedalling": "pedaling",
-    "pedestrianisation": "pedestrianization",
-    "pedestrianise": "pedestrianize",
-    "pedestrianised": "pedestrianized",
-    "pedestrianises": "pedestrianizes",
-    "pedestrianising": "pedestrianizing",
-    "penalise": "penalize",
-    "penalised": "penalized",
-    "penalises": "penalizes",
-    "penalising": "penalizing",
-    "pencilled": "penciled",
-    "pencilling": "penciling",
-    "personalise": "personalize",
-    "personalised": "personalized",
-    "personalises": "personalizes",
-    "personalising": "personalizing",
-    "pharmacopoeia": "pharmacopeia",
-    "pharmacopoeias": "pharmacopeias",
-    "philosophise": "philosophize",
-    "philosophised": "philosophized",
-    "philosophises": "philosophizes",
-    "philosophising": "philosophizing",
-    "philtre": "filter",
-    "philtres": "filters",
-    "phoney": "phony",
-    "plagiarise": "plagiarize",
-    "plagiarised": "plagiarized",
-    "plagiarises": "plagiarizes",
-    "plagiarising": "plagiarizing",
-    "plough": "plow",
-    "ploughed": "plowed",
-    "ploughing": "plowing",
-    "ploughman": "plowman",
-    "ploughmen": "plowmen",
-    "ploughs": "plows",
-    "ploughshare": "plowshare",
-    "ploughshares": "plowshares",
-    "polarisation": "polarization",
-    "polarise": "polarize",
-    "polarised": "polarized",
-    "polarises": "polarizes",
-    "polarising": "polarizing",
-    "politicisation": "politicization",
-    "politicise": "politicize",
-    "politicised": "politicized",
-    "politicises": "politicizes",
-    "politicising": "politicizing",
-    "popularisation": "popularization",
-    "popularise": "popularize",
-    "popularised": "popularized",
-    "popularises": "popularizes",
-    "popularising": "popularizing",
-    "pouffe": "pouf",
-    "pouffes": "poufs",
-    "practise": "practice",
-    "practised": "practiced",
-    "practises": "practices",
-    "practising": "practicing",
-    "praesidium": "presidium",
-    "praesidiums": "presidiums",
-    "pressurisation": "pressurization",
-    "pressurise": "pressurize",
-    "pressurised": "pressurized",
-    "pressurises": "pressurizes",
-    "pressurising": "pressurizing",
-    "pretence": "pretense",
-    "pretences": "pretenses",
-    "primaeval": "primeval",
-    "prioritisation": "prioritization",
-    "prioritise": "prioritize",
-    "prioritised": "prioritized",
-    "prioritises": "prioritizes",
-    "prioritising": "prioritizing",
-    "privatisation": "privatization",
-    "privatisations": "privatizations",
-    "privatise": "privatize",
-    "privatised": "privatized",
-    "privatises": "privatizes",
-    "privatising": "privatizing",
-    "professionalisation": "professionalization",
-    "professionalise": "professionalize",
-    "professionalised": "professionalized",
-    "professionalises": "professionalizes",
-    "professionalising": "professionalizing",
-    "programme": "program",
-    "programmes": "programs",
-    "prologue": "prolog",
-    "prologues": "prologs",
-    "propagandise": "propagandize",
-    "propagandised": "propagandized",
-    "propagandises": "propagandizes",
-    "propagandising": "propagandizing",
-    "proselytise": "proselytize",
-    "proselytised": "proselytized",
-    "proselytiser": "proselytizer",
-    "proselytisers": "proselytizers",
-    "proselytises": "proselytizes",
-    "proselytising": "proselytizing",
-    "psychoanalyse": "psychoanalyze",
-    "psychoanalysed": "psychoanalyzed",
-    "psychoanalyses": "psychoanalyzes",
-    "psychoanalysing": "psychoanalyzing",
-    "publicise": "publicize",
-    "publicised": "publicized",
-    "publicises": "publicizes",
-    "publicising": "publicizing",
-    "pulverisation": "pulverization",
-    "pulverise": "pulverize",
-    "pulverised": "pulverized",
-    "pulverises": "pulverizes",
-    "pulverising": "pulverizing",
-    "pummelled": "pummel",
-    "pummelling": "pummeled",
-    "pyjama": "pajama",
-    "pyjamas": "pajamas",
-    "pzazz": "pizzazz",
-    "quarrelled": "quarreled",
-    "quarrelling": "quarreling",
-    "radicalise": "radicalize",
-    "radicalised": "radicalized",
-    "radicalises": "radicalizes",
-    "radicalising": "radicalizing",
-    "rancour": "rancor",
-    "randomise": "randomize",
-    "randomised": "randomized",
-    "randomises": "randomizes",
-    "randomising": "randomizing",
-    "rationalisation": "rationalization",
-    "rationalisations": "rationalizations",
-    "rationalise": "rationalize",
-    "rationalised": "rationalized",
-    "rationalises": "rationalizes",
-    "rationalising": "rationalizing",
-    "ravelled": "raveled",
-    "ravelling": "raveling",
-    "realisable": "realizable",
-    "realisation": "realization",
-    "realisations": "realizations",
-    "realise": "realize",
-    "realised": "realized",
-    "realises": "realizes",
-    "realising": "realizing",
-    "recognisable": "recognizable",
-    "recognisably": "recognizably",
-    "recognisance": "recognizance",
-    "recognise": "recognize",
-    "recognised": "recognized",
-    "recognises": "recognizes",
-    "recognising": "recognizing",
-    "reconnoitre": "reconnoiter",
-    "reconnoitred": "reconnoitered",
-    "reconnoitres": "reconnoiters",
-    "reconnoitring": "reconnoitering",
-    "refuelled": "refueled",
-    "refuelling": "refueling",
-    "regularisation": "regularization",
-    "regularise": "regularize",
-    "regularised": "regularized",
-    "regularises": "regularizes",
-    "regularising": "regularizing",
-    "remodelled": "remodeled",
-    "remodelling": "remodeling",
-    "remould": "remold",
-    "remoulded": "remolded",
-    "remoulding": "remolding",
-    "remoulds": "remolds",
-    "reorganisation": "reorganization",
-    "reorganisations": "reorganizations",
-    "reorganise": "reorganize",
-    "reorganised": "reorganized",
-    "reorganises": "reorganizes",
-    "reorganising": "reorganizing",
-    "revelled": "reveled",
-    "reveller": "reveler",
-    "revellers": "revelers",
-    "revelling": "reveling",
-    "revitalise": "revitalize",
-    "revitalised": "revitalized",
-    "revitalises": "revitalizes",
-    "revitalising": "revitalizing",
-    "revolutionise": "revolutionize",
-    "revolutionised": "revolutionized",
-    "revolutionises": "revolutionizes",
-    "revolutionising": "revolutionizing",
-    "rhapsodise": "rhapsodize",
-    "rhapsodised": "rhapsodized",
-    "rhapsodises": "rhapsodizes",
-    "rhapsodising": "rhapsodizing",
-    "rigour": "rigor",
-    "rigours": "rigors",
-    "ritualised": "ritualized",
-    "rivalled": "rivaled",
-    "rivalling": "rivaling",
-    "romanticise": "romanticize",
-    "romanticised": "romanticized",
-    "romanticises": "romanticizes",
-    "romanticising": "romanticizing",
-    "rumour": "rumor",
-    "rumoured": "rumored",
-    "rumours": "rumors",
-    "sabre": "saber",
-    "sabres": "sabers",
-    "saltpetre": "saltpeter",
-    "sanitise": "sanitize",
-    "sanitised": "sanitized",
-    "sanitises": "sanitizes",
-    "sanitising": "sanitizing",
-    "satirise": "satirize",
-    "satirised": "satirized",
-    "satirises": "satirizes",
-    "satirising": "satirizing",
-    "saviour": "savior",
-    "saviours": "saviors",
-    "savour": "savor",
-    "savoured": "savored",
-    "savouries": "savories",
-    "savouring": "savoring",
-    "savours": "savors",
-    "savoury": "savory",
-    "scandalise": "scandalize",
-    "scandalised": "scandalized",
-    "scandalises": "scandalizes",
-    "scandalising": "scandalizing",
-    "sceptic": "skeptic",
-    "sceptical": "skeptical",
-    "sceptically": "skeptically",
-    "scepticism": "skepticism",
-    "sceptics": "skeptics",
-    "sceptre": "scepter",
-    "sceptres": "scepters",
-    "scrutinise": "scrutinize",
-    "scrutinised": "scrutinized",
-    "scrutinises": "scrutinizes",
-    "scrutinising": "scrutinizing",
-    "secularisation": "secularization",
-    "secularise": "secularize",
-    "secularised": "secularized",
-    "secularises": "secularizes",
-    "secularising": "secularizing",
-    "sensationalise": "sensationalize",
-    "sensationalised": "sensationalized",
-    "sensationalises": "sensationalizes",
-    "sensationalising": "sensationalizing",
-    "sensitise": "sensitize",
-    "sensitised": "sensitized",
-    "sensitises": "sensitizes",
-    "sensitising": "sensitizing",
-    "sentimentalise": "sentimentalize",
-    "sentimentalised": "sentimentalized",
-    "sentimentalises": "sentimentalizes",
-    "sentimentalising": "sentimentalizing",
-    "sepulchre": "sepulcher",
-    "sepulchres": "sepulchers",
-    "serialisation": "serialization",
-    "serialisations": "serializations",
-    "serialise": "serialize",
-    "serialised": "serialized",
-    "serialises": "serializes",
-    "serialising": "serializing",
-    "sermonise": "sermonize",
-    "sermonised": "sermonized",
-    "sermonises": "sermonizes",
-    "sermonising": "sermonizing",
-    "sheikh": "sheik",
-    "shovelled": "shoveled",
-    "shovelling": "shoveling",
-    "shrivelled": "shriveled",
-    "shrivelling": "shriveling",
-    "signalise": "signalize",
-    "signalised": "signalized",
-    "signalises": "signalizes",
-    "signalising": "signalizing",
-    "signalled": "signaled",
-    "signalling": "signaling",
-    "smoulder": "smolder",
-    "smouldered": "smoldered",
-    "smouldering": "smoldering",
-    "smoulders": "smolders",
-    "snivelled": "sniveled",
-    "snivelling": "sniveling",
-    "snorkelled": "snorkeled",
-    "snorkelling": "snorkeling",
-    "snowplough": "snowplow",
-    "snowploughs": "snowplow",
-    "socialisation": "socialization",
-    "socialise": "socialize",
-    "socialised": "socialized",
-    "socialises": "socializes",
-    "socialising": "socializing",
-    "sodomise": "sodomize",
-    "sodomised": "sodomized",
-    "sodomises": "sodomizes",
-    "sodomising": "sodomizing",
-    "solemnise": "solemnize",
-    "solemnised": "solemnized",
-    "solemnises": "solemnizes",
-    "solemnising": "solemnizing",
-    "sombre": "somber",
-    "specialisation": "specialization",
-    "specialisations": "specializations",
-    "specialise": "specialize",
-    "specialised": "specialized",
-    "specialises": "specializes",
-    "specialising": "specializing",
-    "spectre": "specter",
-    "spectres": "specters",
-    "spiralled": "spiraled",
-    "spiralling": "spiraling",
-    "splendour": "splendor",
-    "splendours": "splendors",
-    "squirrelled": "squirreled",
-    "squirrelling": "squirreling",
-    "stabilisation": "stabilization",
-    "stabilise": "stabilize",
-    "stabilised": "stabilized",
-    "stabiliser": "stabilizer",
-    "stabilisers": "stabilizers",
-    "stabilises": "stabilizes",
-    "stabilising": "stabilizing",
-    "standardisation": "standardization",
-    "standardise": "standardize",
-    "standardised": "standardized",
-    "standardises": "standardizes",
-    "standardising": "standardizing",
-    "stencilled": "stenciled",
-    "stencilling": "stenciling",
-    "sterilisation": "sterilization",
-    "sterilisations": "sterilizations",
-    "sterilise": "sterilize",
-    "sterilised": "sterilized",
-    "steriliser": "sterilizer",
-    "sterilisers": "sterilizers",
-    "sterilises": "sterilizes",
-    "sterilising": "sterilizing",
-    "stigmatisation": "stigmatization",
-    "stigmatise": "stigmatize",
-    "stigmatised": "stigmatized",
-    "stigmatises": "stigmatizes",
-    "stigmatising": "stigmatizing",
-    "storey": "story",
-    "storeys": "stories",
-    "subsidisation": "subsidization",
-    "subsidise": "subsidize",
-    "subsidised": "subsidized",
-    "subsidiser": "subsidizer",
-    "subsidisers": "subsidizers",
-    "subsidises": "subsidizes",
-    "subsidising": "subsidizing",
-    "succour": "succor",
-    "succoured": "succored",
-    "succouring": "succoring",
-    "succours": "succors",
-    "sulphate": "sulfate",
-    "sulphates": "sulfates",
-    "sulphide": "sulfide",
-    "sulphides": "sulfides",
-    "sulphur": "sulfur",
-    "sulphurous": "sulfurous",
-    "summarise": "summarize",
-    "summarised": "summarized",
-    "summarises": "summarizes",
-    "summarising": "summarizing",
-    "swivelled": "swiveled",
-    "swivelling": "swiveling",
-    "symbolise": "symbolize",
-    "symbolised": "symbolized",
-    "symbolises": "symbolizes",
-    "symbolising": "symbolizing",
-    "sympathise": "sympathize",
-    "sympathised": "sympathized",
-    "sympathiser": "sympathizer",
-    "sympathisers": "sympathizers",
-    "sympathises": "sympathizes",
-    "sympathising": "sympathizing",
-    "synchronisation": "synchronization",
-    "synchronise": "synchronize",
-    "synchronised": "synchronized",
-    "synchronises": "synchronizes",
-    "synchronising": "synchronizing",
-    "synthesise": "synthesize",
-    "synthesised": "synthesized",
-    "synthesiser": "synthesizer",
-    "synthesisers": "synthesizers",
-    "synthesises": "synthesizes",
-    "synthesising": "synthesizing",
-    "syphon": "siphon",
-    "syphoned": "siphoned",
-    "syphoning": "siphoning",
-    "syphons": "siphons",
-    "systematisation": "systematization",
-    "systematise": "systematize",
-    "systematised": "systematized",
-    "systematises": "systematizes",
-    "systematising": "systematizing",
-    "tantalise": "tantalize",
-    "tantalised": "tantalized",
-    "tantalises": "tantalizes",
-    "tantalising": "tantalizing",
-    "tantalisingly": "tantalizingly",
-    "tasselled": "tasseled",
-    "technicolour": "technicolor",
-    "temporise": "temporize",
-    "temporised": "temporized",
-    "temporises": "temporizes",
-    "temporising": "temporizing",
-    "tenderise": "tenderize",
-    "tenderised": "tenderized",
-    "tenderises": "tenderizes",
-    "tenderising": "tenderizing",
-    "terrorise": "terrorize",
-    "terrorised": "terrorized",
-    "terrorises": "terrorizes",
-    "terrorising": "terrorizing",
-    "theatre": "theater",
-    "theatregoer": "theatergoer",
-    "theatregoers": "theatergoers",
-    "theatres": "theaters",
-    "theorise": "theorize",
-    "theorised": "theorized",
-    "theorises": "theorizes",
-    "theorising": "theorizing",
-    "tonne": "ton",
-    "tonnes": "tons",
-    "towelled": "toweled",
-    "towelling": "toweling",
-    "toxaemia": "toxemia",
-    "tranquillise": "tranquilize",
-    "tranquillised": "tranquilized",
-    "tranquilliser": "tranquilizer",
-    "tranquillisers": "tranquilizers",
-    "tranquillises": "tranquilizes",
-    "tranquillising": "tranquilizing",
-    "tranquillity": "tranquility",
-    "tranquillize": "tranquilize",
-    "tranquillized": "tranquilized",
-    "tranquillizer": "tranquilizer",
-    "tranquillizers": "tranquilizers",
-    "tranquillizes": "tranquilizes",
-    "tranquillizing": "tranquilizing",
-    "tranquilly": "tranquility",
-    "transistorised": "transistorized",
-    "traumatise": "traumatize",
-    "traumatised": "traumatized",
-    "traumatises": "traumatizes",
-    "traumatising": "traumatizing",
-    "travelled": "traveled",
-    "traveller": "traveler",
-    "travellers": "travelers",
-    "travelling": "traveling",
-    "travelog": "travelogue",
-    "travelogs": "travelogues",
-    "trialled": "trialed",
-    "trialling": "trialing",
-    "tricolour": "tricolor",
-    "tricolours": "tricolors",
-    "trivialise": "trivialize",
-    "trivialised": "trivialized",
-    "trivialises": "trivializes",
-    "trivialising": "trivializing",
-    "tumour": "tumor",
-    "tumours": "tumors",
-    "tunnelled": "tunneled",
-    "tunnelling": "tunneling",
-    "tyrannise": "tyrannize",
-    "tyrannised": "tyrannized",
-    "tyrannises": "tyrannizes",
-    "tyrannising": "tyrannizing",
-    "tyre": "tire",
-    "tyres": "tires",
-    "unauthorised": "unauthorized",
-    "uncivilised": "uncivilized",
-    "underutilised": "underutilized",
-    "unequalled": "unequaled",
-    "unfavourable": "unfavorable",
-    "unfavourably": "unfavorably",
-    "unionisation": "unionization",
-    "unionise": "unionize",
-    "unionised": "unionized",
-    "unionises": "unionizes",
-    "unionising": "unionizing",
-    "unorganised": "unorganized",
-    "unravelled": "unraveled",
-    "unravelling": "unraveling",
-    "unrecognisable": "unrecognizable",
-    "unrecognised": "unrecognized",
-    "unrivalled": "unrivaled",
-    "unsavoury": "unsavory",
-    "untrammelled": "untrammeled",
-    "urbanisation": "urbanization",
-    "urbanise": "urbanize",
-    "urbanised": "urbanized",
-    "urbanises": "urbanizes",
-    "urbanising": "urbanizing",
-    "utilisable": "utilizable",
-    "utilisation": "utilization",
-    "utilise": "utilize",
-    "utilised": "utilized",
-    "utilises": "utilizes",
-    "utilising": "utilizing",
-    "valour": "valor",
-    "vandalise": "vandalize",
-    "vandalised": "vandalized",
-    "vandalises": "vandalizes",
-    "vandalising": "vandalizing",
-    "vaporisation": "vaporization",
-    "vaporise": "vaporize",
-    "vaporised": "vaporized",
-    "vaporises": "vaporizes",
-    "vaporising": "vaporizing",
-    "vapour": "vapor",
-    "vapours": "vapors",
-    "verbalise": "verbalize",
-    "verbalised": "verbalized",
-    "verbalises": "verbalizes",
-    "verbalising": "verbalizing",
-    "victimisation": "victimization",
-    "victimise": "victimize",
-    "victimised": "victimized",
-    "victimises": "victimizes",
-    "victimising": "victimizing",
-    "videodisc": "videodisk",
-    "videodiscs": "videodisks",
-    "vigour": "vigor",
-    "visualisation": "visualization",
-    "visualisations": "visualizations",
-    "visualise": "visualize",
-    "visualised": "visualized",
-    "visualises": "visualizes",
-    "visualising": "visualizing",
-    "vocalisation": "vocalization",
-    "vocalisations": "vocalizations",
-    "vocalise": "vocalize",
-    "vocalised": "vocalized",
-    "vocalises": "vocalizes",
-    "vocalising": "vocalizing",
-    "vulcanised": "vulcanized",
-    "vulgarisation": "vulgarization",
-    "vulgarise": "vulgarize",
-    "vulgarised": "vulgarized",
-    "vulgarises": "vulgarizes",
-    "vulgarising": "vulgarizing",
-    "waggon": "wagon",
-    "waggons": "wagons",
-    "watercolour": "watercolor",
-    "watercolours": "watercolors",
-    "weaselled": "weaseled",
-    "weaselling": "weaseling",
-    "westernisation": "westernization",
-    "westernise": "westernize",
-    "westernised": "westernized",
-    "westernises": "westernizes",
-    "westernising": "westernizing",
-    "womanise": "womanize",
-    "womanised": "womanized",
-    "womaniser": "womanizer",
-    "womanisers": "womanizers",
-    "womanises": "womanizes",
-    "womanising": "womanizing",
-    "woollen": "woolen",
-    "woollens": "woolens",
-    "woollies": "woolies",
-    "woolly": "wooly",
-    "worshipped": "worshiped",
-    "worshipping": "worshiping",
-    "worshipper": "worshiper",
-    "yodelled": "yodeled",
-    "yodelling": "yodeling",
-    "yoghourt": "yogurt",
-    "yoghourts": "yogurts",
-    "yoghurt": "yogurt",
-    "yoghurts": "yogurts",
-    "mhm": "hmm",
-    "mm": "hmm",
-    "mmm": "hmm",
-}
-
-
 class EnglishSpellingNormalizer:
     """
     Applies British-American spelling mappings as listed in [1].
@@ -2251,15 +507,15 @@ class EnglishSpellingNormalizer:
     [1] https://www.tysto.com/uk-us-spelling-list.html
     """
 
-    def __init__(self):
-        self.mapping = ENGLISH_MAPPING
+    def __init__(self, english_spelling_mapping):
+        self.mapping = english_spelling_mapping
 
     def __call__(self, s: str):
         return " ".join(self.mapping.get(word, word) for word in s.split())
 
 
 class EnglishTextNormalizer:
-    def __init__(self):
+    def __init__(self, english_spelling_mapping):
         self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
         self.replacers = {
             # common contractions
@@ -2317,7 +573,7 @@ def __init__(self):
             r"'m\b": " am",
         }
         self.standardize_numbers = EnglishNumberNormalizer()
-        self.standardize_spellings = EnglishSpellingNormalizer()
+        self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping)
 
     def __call__(self, s: str):
         s = s.lower()
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index ed8aada73bda9..d322535b87ef4 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -43,12 +43,14 @@ def __init__(self, feature_extractor, tokenizer):
 
     def _get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         forced_decoder_tokens = ""
-        
+
         if task is not None:
             if f"<|{task}|>" not in self.tokenizer.additional_special_tokens:
-                raise ValueError(f"'{task}' is not supported. The language should be in : {{'transcribe', 'translate'}}")
+                raise ValueError(
+                    f"'{task}' is not supported. The language should be in : {{'transcribe', 'translate'}}"
+                )
             forced_decoder_tokens += f"<|{task}|>"
-            
+
         if language is not None:
             if f"<|{language}|>" not in self.tokenizer.additional_special_tokens:
                 raise ValueError(
@@ -69,7 +71,7 @@ def _get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
 
         forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else ""
         ids = self.tokenizer.encode(forced_decoder_tokens)
-        forced_decoder_ids = [ (rank +1 ,token) for rank,token in enumerate(ids)]
+        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(ids)]
         return forced_decoder_ids
 
     def __call__(self, *args, **kwargs):
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index dc9b45d974850..a319546dc34a1 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -24,13 +24,21 @@
 from .english_normalizer import EnglishTextNormalizer
 
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "tokenizer_file": "tokenizer.json", "merges_file": "merges.txt"}
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_file": "tokenizer.json",
+    "merges_file": "merges.txt",
+    "normalizer_file": "normalizer.json",
+}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/vocab.json",
     },
     "merges_file": {"openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/merges_file.txt"},
+    "normalizer_file": {
+        "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/normalizer.json"
+    },
 }
 
 MAX_MODEL_INPUT_SIZES = {
@@ -117,6 +125,7 @@ def __init__(
         self,
         vocab_file,
         merges_file,
+        normalizer_file=None,
         task=None,
         language="en",
         errors="replace",
@@ -160,6 +169,10 @@ def __init__(
         self.cache = {}
         self.add_prefix_space = add_prefix_space
 
+        if normalizer_file is not None:
+            with open(normalizer_file, encoding="utf-8") as vocab_handle:
+                self.english_spelling_normalizer = json.load(vocab_handle)
+
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
@@ -287,7 +300,7 @@ def _normalize(self, text):
         Normalize a given string using the `EnglishTextNormalizer` class, which preforms commons transformation on
         english text.
         """
-        normalizer = EnglishTextNormalizer()
+        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
         return normalizer(text)
 
     def _decode(

From ff8aa6c2248d9e6e7a40c89909bc653f57d022b0 Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Tue, 4 Oct 2022 21:45:45 +0000
Subject: [PATCH 151/156]  remove protected

---
 src/transformers/models/whisper/processing_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index d322535b87ef4..4b5d33e4fa78f 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -41,7 +41,7 @@ def __init__(self, feature_extractor, tokenizer):
         self.current_processor = self.feature_extractor
         self._in_target_context_manager = False
 
-    def _get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
+    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         forced_decoder_tokens = ""
 
         if task is not None:

From 40461a9cd8282121abd556bd7f757b6d793dafcd Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 5 Oct 2022 12:17:37 +0000
Subject: [PATCH 152/156] update

---
 src/transformers/generation_logits_process.py |  4 +-
 .../models/whisper/configuration_whisper.py   | 23 +++--
 .../models/whisper/processing_whisper.py      | 14 +--
 .../models/whisper/tokenization_whisper.py    | 14 ++-
 tests/models/whisper/test_modeling_whisper.py | 97 +++++++------------
 5 files changed, 68 insertions(+), 84 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 8a4402a589c98..f3038a400b49a 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -735,8 +735,8 @@ def __call__(self, input_ids, scores):
 
 
 class ForceTokensLogitsProcessor(LogitsProcessor):
-    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
-    are not sampled."""
+    r"""This processor can be used to force a list of tokens. The processor will set their log probs to `inf` so that they
+    are sampled at their corresponding index."""
 
     def __init__(self, force_token_map):
         self.force_token_map = dict(force_token_map)
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index d4242ab34818a..1874f0f06b72d 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -34,19 +34,18 @@
     4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
     11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
     17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
-    34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360
+    34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
 ]
-NON_SPEECH_TOKENS_MULTI = [
-    1, 2, 6, 7, 8, 9,
-    10, 12, 14, 25, 26, 27, 28, 29, 31, 58,
-    59, 60, 61, 62, 63, 90, 91, 92, 93, 359,
-    503, 522, 542, 873, 893, 902, 918, 922, 931, 1350,
-    1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961,
-    4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938,
-    12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604,
-    18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464,
-    31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50257, 50360,
-    50359
+NON_SPEECH_TOKENS_MULTI = [    
+    1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
+    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
+    63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
+    893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
+    3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
+    7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
+    14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
+    22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
+    42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
 ]
 # fmt: on
 
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 4b5d33e4fa78f..81c3e2792aa67 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -44,13 +44,6 @@ def __init__(self, feature_extractor, tokenizer):
     def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         forced_decoder_tokens = ""
 
-        if task is not None:
-            if f"<|{task}|>" not in self.tokenizer.additional_special_tokens:
-                raise ValueError(
-                    f"'{task}' is not supported. The language should be in : {{'transcribe', 'translate'}}"
-                )
-            forced_decoder_tokens += f"<|{task}|>"
-
         if language is not None:
             if f"<|{language}|>" not in self.tokenizer.additional_special_tokens:
                 raise ValueError(
@@ -68,6 +61,13 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
                     " '<|as|>', '<|tt|>', '<|haw|>', '<|ln|>', '<|ha|>', '<|ba|>', '<|jw|>', '<|su|>'"
                 )
             forced_decoder_tokens += f"<|{language}|>"
+        
+        if task is not None:
+            if f"<|{task}|>" not in self.tokenizer.additional_special_tokens:
+                raise ValueError(
+                    f"'{task}' is not supported. The language should be in : {{'transcribe', 'translate'}}"
+                )
+            forced_decoder_tokens += f"<|{task}|>"
 
         forced_decoder_tokens += "<|notimestamps|>" if no_timestamps else ""
         ids = self.tokenizer.encode(forced_decoder_tokens)
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index a319546dc34a1..bb3a55931dd19 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -172,6 +172,8 @@ def __init__(
         if normalizer_file is not None:
             with open(normalizer_file, encoding="utf-8") as vocab_handle:
                 self.english_spelling_normalizer = json.load(vocab_handle)
+        else:
+            self.english_spelling_normalizer = None
 
         # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
@@ -343,7 +345,6 @@ def convert_tokens_to_string(self, tokens):
         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary with GPT2 -> Whisper
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
@@ -354,6 +355,9 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         merge_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )
+        normalizer_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
+        )
 
         with open(vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
@@ -371,7 +375,13 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
                 writer.write(" ".join(bpe_tokens) + "\n")
                 index += 1
 
-        return vocab_file, merge_file
+        if self.english_spelling_normalizer is not None:
+            with open(normalizer_file, "w", encoding="utf-8") as f:
+                f.write(
+                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
+                )
+
+        return vocab_file, merge_file, normalizer_file
 
     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.prepare_for_tokenization with GPT2 -> Whisper
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index b15b589952479..395dadc321e87 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -21,7 +21,6 @@
 import unittest
 
 from transformers import WhisperConfig
-from transformers.models.whisper.configuration_whisper import NON_SPEECH_TOKENS_MULTI
 from transformers.testing_utils import is_torch_available, require_torch, require_torchaudio, slow, torch_device
 from transformers.utils import cached_property
 from transformers.utils.import_utils import is_datasets_available
@@ -43,7 +42,6 @@
         WhisperForConditionalGeneration,
         WhisperModel,
         WhisperProcessor,
-        WhisperTokenizer,
         set_seed,
     )
     from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
@@ -864,18 +862,18 @@ def test_tiny_en_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         model.to(torch_device)
         model.config.decoder_start_token_id = 50257
 
         input_speech = self._load_datasamples(1)
-        feaure_extractor = WhisperFeatureExtractor()
-
-        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
+            torch_device
+        )
 
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
-        generated_ids = model.generate(input_features, num_beams=5, forced_bos_token_id=50362)
-        transcript = tokenizer.batch_decode(generated_ids)[0]
+        generated_ids = model.generate(input_features, num_beams=5)
+        transcript = processor.tokenizer.batch_decode(generated_ids)[0]
 
         EXPECTED_TRANSCRIPT = (
             "<|startoftranscript|><|notimestamps|> Mr. Quilter is the apostle of the middle"
@@ -888,20 +886,17 @@ def test_tiny_generation(self):
 
         torch_device = "cpu"
         set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
         model.to(torch_device)
-        model.config.begin_suppress_tokens = None
-        model.config.suppress_tokens = None
-        input_speech = self._load_datasamples(1)
-        feaure_extractor = WhisperFeatureExtractor()
 
-        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
-
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
+        input_speech = self._load_datasamples(1)
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
+            torch_device
+        )
 
-        decoder_input_ids = torch.tensor([[50258]]).long()
-        generated_ids = model.generate(input_features, num_beams=5, decoder_input_ids=decoder_input_ids)
-        transcript = tokenizer.decode(generated_ids[0])
+        generated_ids = model.generate(input_features, num_beams=5)
+        transcript = processor.tokenizer.decode(generated_ids[0])
 
         EXPECTED_TRANSCRIPT = (
             "<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle"
@@ -913,23 +908,21 @@ def test_tiny_generation(self):
     def test_large_generation(self):
         torch_device = "cpu"
         set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
 
         input_speech = self._load_datasamples(1)
-        feaure_extractor = WhisperFeatureExtractor()
-
-        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
-
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
+            torch_device
+        )
 
-        decoder_input_ids = torch.tensor([[50258]]).long()
+        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
         generated_ids = model.generate(
             input_features,
             do_sample=False,
-            decoder_input_ids=decoder_input_ids,
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = " Mr. Quilter is the apostle of the middle classes and we are glad"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
@@ -938,52 +931,34 @@ def test_large_generation(self):
     def test_large_generation_multilingual(self):
         torch_device = "cpu"
         set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
         model.to(torch_device)
-        model.config.suppress_tokens = NON_SPEECH_TOKENS_MULTI
 
         ds = load_dataset("common_voice", "ja", split="test", streaming=True)
         ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
-        ds_iter = iter(ds)
-        input_speech = next(ds_iter)["audio"]["array"]
-
-        feaure_extractor = WhisperFeatureExtractor()
-
-        input_features = feaure_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(torch_device)
-
-        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
-
-        model.config.begin_suppress_tokens = [tokenizer.encode(" ")[0], tokenizer.eos_token_id]
-        decoder_input_ids = torch.tensor([[50258, 50359, 50266, 50363]]).long().to(torch_device)
-        generated_ids = model.generate(
-            input_features,
-            do_sample=True,
-            decoder_input_ids=decoder_input_ids,
+        input_speech = next(iter(ds))["audio"]["array"]
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
+            torch_device
         )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = "木村さんに電話を貸してもらいました。 木"
+        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="transcribe")
+        generated_ids = model.generate(input_features, do_sample=False)
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        EXPECTED_TRANSCRIPT = "木村さんに電話を貸してもらいました"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-        decoder_input_ids = torch.tensor([[50258, 50359, 50357]]).long().to(torch_device)
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            decoder_input_ids=decoder_input_ids,
-        )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
+        generated_ids = model.generate(input_features,do_sample=False,)
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita."
+        EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
-        decoder_input_ids = torch.tensor([[50258, 50266, 50358, 50363]]).long().to(torch_device)
-        generated_ids = model.generate(
-            input_features,
-            do_sample=False,
-            decoder_input_ids=decoder_input_ids,
-        )
-        transcript = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="ja", task="translate")
+        generated_ids = model.generate(input_features, do_sample=False)
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san. Thank you for watching. Please subscribe"
-        # should only be "I borrowed a phone from Kimura san. But it seems like it is a well known bug"
+        EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)

From c5a2581001ce1dfbc0db0867aa18b7d79742e1da Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 5 Oct 2022 14:52:44 +0200
Subject: [PATCH 153/156] Update
 src/transformers/models/whisper/configuration_whisper.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/models/whisper/configuration_whisper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 1874f0f06b72d..9f8861dd550f9 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -122,7 +122,7 @@ class WhisperConfig(PretrainedConfig):
             End of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie input and output embeddings.
-        suppress_tokens (`List[int]`, *optional*, defaults to None):
+        suppress_tokens (`List[int]`, *optional*):
             A list containing the non-speech tokens that will be used by the logit processor in the `generate`
             function. NON_SPEECH_TOKENS and NON_SPEECH_TOKENS_MULTI each correspond to the `english-only` and the
             `multilingual` model.

From 2c618397b6dc20c9b95e8335d4b3e19151ff306f Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 5 Oct 2022 13:02:19 +0000
Subject: [PATCH 154/156] update based on review

---
 src/transformers/generation_logits_process.py         |  7 ++++---
 src/transformers/generation_utils.py                  |  4 +++-
 .../models/whisper/configuration_whisper.py           |  2 +-
 src/transformers/models/whisper/processing_whisper.py |  2 +-
 .../models/whisper/tokenization_whisper.py            | 11 ++++++-----
 tests/models/whisper/test_modeling_whisper.py         |  5 ++++-
 6 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index f3038a400b49a..0e1414aa08bb0 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -717,7 +717,7 @@ def __init__(self, begin_suppress_tokens, begin_index):
 
     def __call__(self, input_ids, scores):
         if input_ids.shape[1] == self.begin_index:
-            scores[:, self.begin_suppress_tokens] = -np.inf
+            scores[:, self.begin_suppress_tokens] = -float("inf")
 
         return scores
 
@@ -730,7 +730,7 @@ def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
 
     def __call__(self, input_ids, scores):
-        scores[:, self.suppress_tokens] = -np.inf
+        scores[:, self.suppress_tokens] = -float("inf")
         return scores
 
 
@@ -745,5 +745,6 @@ def __call__(self, input_ids, scores):
         generation_idx = input_ids.shape[-1]
         current_token = self.force_token_map.get(generation_idx, None)
         if current_token is not None:
-            scores[:, current_token] = np.inf
+            scores[:, :] = -float("inf")
+            scores[:, current_token] = 0
         return scores
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 1524925481480..380eec07270c9 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -779,6 +779,8 @@ def _get_logits_processor(
         if begin_suppress_tokens is not None:
             begin_index = input_ids_seq_length
             begin_index = begin_index if (input_ids_seq_length > 1 or forced_bos_token_id is None) else begin_index + 1
+            if forced_decoder_ids is not None:
+                begin_index += forced_decoder_ids[-1][0]  # generation starts after the last token that is forced
             processors.append(SuppressTokensAtBeginLogitsProcessor(begin_suppress_tokens, begin_index))
         if forced_decoder_ids is not None:
             processors.append(ForceTokensLogitsProcessor(forced_decoder_ids))
@@ -1120,7 +1122,7 @@ def generate(
                 A list of tokens that will be supressed at the begining of the generation. The `SupressBeginTokens`
                 logit processor will set their log probs to `-inf` so that they are not sampled.
             forced_decoder_ids (`List[int]`, *optional*, defaults to `model.config.forced_decoder_ids`):
-                A list of tokens that will be forced as beginning tokens.
+                A list of tokens that will be forced as beginning tokens, before sampling.
 
             model_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model
diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 1874f0f06b72d..e0911bca4ad31 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -36,7 +36,7 @@
     17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
     34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
 ]
-NON_SPEECH_TOKENS_MULTI = [    
+NON_SPEECH_TOKENS_MULTI = [
     1, 2, 6, 7, 8, 9, 10, 12, 14, 25,
     26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
     63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py
index 81c3e2792aa67..3bdcb0f51f2f5 100644
--- a/src/transformers/models/whisper/processing_whisper.py
+++ b/src/transformers/models/whisper/processing_whisper.py
@@ -61,7 +61,7 @@ def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
                     " '<|as|>', '<|tt|>', '<|haw|>', '<|ln|>', '<|ha|>', '<|ba|>', '<|jw|>', '<|su|>'"
                 )
             forced_decoder_tokens += f"<|{language}|>"
-        
+
         if task is not None:
             if f"<|{task}|>" not in self.tokenizer.additional_special_tokens:
                 raise ValueError(
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index bb3a55931dd19..696aa4f4e5f0f 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -101,6 +101,8 @@ class WhisperTokenizer(PreTrainedTokenizer):
             Path to the vocabulary file.
         merges_file (`str`):
             Path to the merges file.
+        normalizer_file (`str`, *optional*, defaults to `None`):
+            Path to the normalizer_file file.
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
@@ -113,7 +115,10 @@ class WhisperTokenizer(PreTrainedTokenizer):
             The end of sequence token.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
+            other word.
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial <|endoftext|> to the input. This allows to treat the leading word just as
+            any other word.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -126,8 +131,6 @@ def __init__(
         vocab_file,
         merges_file,
         normalizer_file=None,
-        task=None,
-        language="en",
         errors="replace",
         unk_token="<|endoftext|>",
         bos_token="<|endoftext|>",
@@ -153,8 +156,6 @@ def __init__(
             **kwargs,
         )
         self.add_bos_token = add_bos_token
-        self.language = language
-        self.task = task
 
         with open(vocab_file, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 395dadc321e87..f0c341540efdc 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -950,7 +950,10 @@ def test_large_generation_multilingual(self):
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
 
         model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
-        generated_ids = model.generate(input_features,do_sample=False,)
+        generated_ids = model.generate(
+            input_features,
+            do_sample=False,
+        )
         transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         EXPECTED_TRANSCRIPT = " Kimura san ni denwa wo kaite moraimashita"

From 135be7de89c6bed1181d2071511db8a2b3c652f8 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 5 Oct 2022 15:06:46 +0200
Subject: [PATCH 155/156] Update
 src/transformers/models/whisper/configuration_whisper.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/models/whisper/configuration_whisper.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 9d7043a1260af..593b84e8611d3 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -90,11 +90,11 @@ class WhisperConfig(PretrainedConfig):
             Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
             are provided to the `generate` function. It is used to guide the model`s generation process depending on
             the task.
-        use_cache (`bool`, *optional*, defaults to True):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        is_encoder_decoder (`bool`, *optional*, defaults to True):
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
             Whether the model is used as an encoder/decoder or not.
-        activation_function (`str`, *optional*, defaults to "gelu"):
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"silu"` and `"gelu_new"` are supported.
         d_model (`int`, *optional*, defaults to 256):

From 8e047f91048d706b36694ebeb4ff47eab4decacb Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Wed, 5 Oct 2022 16:46:57 +0000
Subject: [PATCH 156/156] add batched tests

---
 .../models/whisper/configuration_whisper.py   |  2 +-
 tests/models/whisper/test_modeling_whisper.py | 75 +++++++++++++++++++
 .../whisper/test_tokenization_whisper.py      | 14 ++++
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py
index 9d7043a1260af..280176e01c0a1 100644
--- a/src/transformers/models/whisper/configuration_whisper.py
+++ b/src/transformers/models/whisper/configuration_whisper.py
@@ -173,7 +173,7 @@ def __init__(
         scale_embedding=False,
         max_source_positions=1500,
         max_target_positions=448,
-        pad_token_id=0,
+        pad_token_id=50256,
         bos_token_id=50257,
         eos_token_id=50256,
         tie_word_embeddings=True,
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f0c341540efdc..e07d8122a5e5c 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -965,3 +965,78 @@ def test_large_generation_multilingual(self):
 
         EXPECTED_TRANSCRIPT = " I borrowed a phone from Kimura san"
         self.assertEqual(transcript, EXPECTED_TRANSCRIPT)
+
+    @slow
+    def test_large_batched_generation(self):
+        set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
+
+        input_speech = self._load_datasamples(4)
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features
+        generated_ids = model.generate(input_features)
+
+        # fmt: off
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                [50258, 50358, 50363, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 293, 321, 366, 5404, 281],
+                [50258, 50358, 50363, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50257, 50257],
+                [50258, 50358, 50363, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256],
+                [50258, 50358, 50363, 634, 575, 12525, 22618, 1968, 6144, 35617, 20084, 1756, 311, 589, 307, 534, 10281, 934, 439, 11]
+            ]
+        )
+        # fmt: on
+
+        self.assertTrue(torch.allclose(generated_ids, EXPECTED_LOGITS))
+
+        # fmt: off
+        EXPECTED_TRANSCRIPT = [
+            ' Mr. Quilter is the apostle of the middle classes, and we are glad to',
+            " Nor is Mr. Quilter's manner less interesting than his matter.",
+            " He tells us that at this festive season of the year, with Christmas and roast beef",
+            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"
+        ]
+        # fmt: on
+
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
+
+    @slow
+    def test_tiny_en_batched_generation(self):
+        torch_device = "cuda"
+        set_seed(0)
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model.to(torch_device)
+
+        input_speech = self._load_datasamples(4)
+        input_features = processor.feature_extractor(raw_speech=input_speech, return_tensors="pt").input_features.to(
+            torch_device
+        )
+        generated_ids = model.generate(input_features).to("cpu")
+
+        # fmt: off
+        EXPECTED_LOGITS = torch.tensor(
+            [
+                [50257, 50362, 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262, 3504, 6097, 11, 290, 356, 389, 9675, 284],
+                [50257, 50362, 5414, 318, 1770, 13, 2264, 346, 353, 338, 5642, 1342, 3499, 621, 465, 2300, 13, 50256, 50256, 50256],
+                [50257, 50362, 679, 4952, 514, 326, 379, 428, 43856, 1622, 286, 262, 614, 11, 351, 6786, 290, 32595, 12023, 28236],
+                [50257, 50362, 679, 468, 12296, 17188, 1771, 7361, 26113, 18881, 1122, 338, 670, 318, 1107, 8312, 706, 477, 290, 460]
+            ]
+
+        )
+        # fmt: on
+
+        self.assertTrue(torch.allclose(generated_ids, EXPECTED_LOGITS))
+
+        # fmt: off
+        EXPECTED_TRANSCRIPT = [
+            " Mr. Quilter is the apostle of the middle classes, and we are glad to",
+            " Nor is Mr. Quilter's manner less interesting than his matter.",
+            " He tells us that at this festive season of the year, with Christmas and roast beef looming",
+            " He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can",
+        ]
+        # fmt: on
+
+        transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        self.assertListEqual(transcript, EXPECTED_TRANSCRIPT)
diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py
index 3c1deb2b7d99e..4dc66a499186a 100644
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -174,3 +174,17 @@ def test_tokenizer_decode_ignores_language_codes(self):
         expected_spanish = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True)
         self.assertEqual(result, expected_spanish)
         self.assertNotIn(self.tokenizer.eos_token, result)
+
+    def test_batch_encoding(self):
+        multilingual_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
+        batch = ["<|en|><|notimestamps|>", "<|en|><|notimestamps|>I am sure that"]
+        batch_output = multilingual_tokenizer.batch_encode_plus(batch, padding=True).input_ids
+
+        # fmt: off
+        EXPECTED_MULTI = [
+            [50258, 50362, 50256, 50256, 50256, 50256],
+            [50258, 50362, 40, 716, 1654, 326]
+        ]
+        # fmt: on
+
+        self.assertListEqual(batch_output, EXPECTED_MULTI)