From 3749f2dd1f8c884b9afaccb101a52209b55dbfe9 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 4 Jul 2022 19:02:56 +0000 Subject: [PATCH 1/8] Deprecate --- src/transformers/generation_flax_utils.py | 50 +++++++-- src/transformers/generation_tf_utils.py | 119 +++++++++++++--------- src/transformers/generation_utils.py | 49 ++++----- 3 files changed, 140 insertions(+), 78 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 59d66a0fe2b4c..0c1348d15293f 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -15,6 +15,7 @@ # limitations under the License. +import warnings from functools import partial from typing import Dict, Optional @@ -163,6 +164,7 @@ def generate( self, input_ids: jnp.ndarray, max_length: Optional[int] = None, + max_new_tokens: Optional[int] = None, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, @@ -209,8 +211,10 @@ def generate( input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. - max_length (`int`, *optional*, defaults to 20): - The maximum length of the sequence to be generated. + max_length (`int`, *optional*, defaults to `model.config.max_length`): + **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + max_new_tokens (`int`, *optional*, defaults to None): + The maximum numbers of tokens to generate, ignoring the current number of tokens. do_sample (`bool`, *optional*, defaults to `False`): Whether or not to use sampling ; use greedy decoding otherwise. temperature (`float`, *optional*, defaults to 1.0): @@ -258,8 +262,6 @@ def generate( >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ```""" # set init values - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id @@ -270,11 +272,6 @@ def generate( if decoder_start_token_id is None and self.config.is_encoder_decoder: raise ValueError("`decoder_start_token_id` has to be defined for encoder-decoder generation.") - if min_length is not None and min_length > max_length: - raise ValueError( - f"Unfeasable length constraints: the minimum length ({min_length}) is larger than the maximum " - f"length ({max_length})" - ) if self.config.is_encoder_decoder: # add encoder_outputs to model_kwargs @@ -283,6 +280,41 @@ def generate( # prepare decoder_input_ids for generation input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + # Prepare `max_length` depending on other stopping criteria + # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + input_ids_seq_length = input_ids.shape[-1] + if max_length is not None: + warnings.warn( + "The `max_length` argument is deprecated and will be removed in v5. 
Use `max_new_tokens` instead.", + FutureWarning, + ) + if max_length is None and max_new_tokens is not None: + max_length = max_new_tokens + input_ids_seq_length + elif max_length is not None and max_new_tokens is not None: + # Both are set, this is odd, raise a warning + warnings.warn( + "Both `max_length` and `max_new_tokens` have been set " + f"but they serve the same purpose. `max_length` {max_length} " + f"will take priority over `max_new_tokens` {max_new_tokens}.", + UserWarning, + ) + # default to config if still None + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + + if min_length is not None and min_length > max_length: + raise ValueError( + f"Unfeasable length constraints: the minimum length ({min_length}) is larger than the maximum " + f"length ({max_length})" + ) + if input_ids_seq_length >= max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" + f" {max_length}. This can lead to unexpected behavior. You should consider increasing" + "``max_new_tokens``." + ) + do_sample = do_sample if do_sample is not None else self.config.do_sample num_beams = num_beams if num_beams is not None else self.config.num_beams diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 1a9b19e2b595b..edac7f63840fe 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -15,6 +15,7 @@ # limitations under the License. import inspect +import warnings from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union @@ -53,8 +54,8 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor - of shape `(batch_size, config.vocab_size)`). + at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each + generated token), with each tensor of shape `(batch_size, config.vocab_size)`. attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. @@ -83,8 +84,8 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size, config.vocab_size)`). + at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each + generated token), with each tensor of shape `(batch_size, config.vocab_size)`. 
encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. @@ -123,8 +124,8 @@ class TFSampleDecoderOnlyOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor - of shape `(batch_size*num_return_sequences, config.vocab_size)`). + at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each + generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`. @@ -153,8 +154,8 @@ class TFSampleEncoderDecoderOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_return_sequences, config.vocab_size)`). + at each generation step. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each + generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. @@ -194,9 +195,9 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput): Final beam scores of the generated `sequences`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this + beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`. 
attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. @@ -227,9 +228,9 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput): Final beam scores of the generated `sequences`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this + beam. `Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, @@ -272,9 +273,9 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput): Final beam scores of the generated `sequences`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this + beam. Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`. attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. @@ -305,9 +306,9 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput): Final beam scores of the generated `sequences`. scores (`tuple(tf.Tensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log - softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam - . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this + beam. 
Tuple of `tf.Tensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. @@ -375,6 +376,7 @@ def generate( self, input_ids=None, max_length=None, + max_new_tokens=None, min_length=None, do_sample=None, early_stopping=None, @@ -423,8 +425,10 @@ def generate( method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. - max_length (`int`, *optional*, defaults to 20): - The maximum length of the sequence to be generated. + max_length (`int`, *optional*, defaults to `model.config.max_length`): + **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + max_new_tokens (`int`, *optional*, defaults to None): + The maximum numbers of tokens to generate, ignoring the current number of tokens. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. do_sample (`bool`, *optional*, defaults to `False`): @@ -577,6 +581,7 @@ def generate( return self._generate( input_ids=input_ids, max_length=max_length, + max_new_tokens=max_new_tokens, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, @@ -1286,6 +1291,7 @@ def _generate( self, input_ids=None, max_length=None, + max_new_tokens=None, min_length=None, do_sample=None, early_stopping=None, @@ -1332,8 +1338,10 @@ def _generate( input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*): The sequence used as a prompt for the generation. If `None` the method initializes it with `bos_token_id` and a batch size of 1. - max_length (`int`, *optional*, defaults to 20): - The maximum length of the sequence to be generated. + max_length (`int`, *optional*, defaults to `model.config.max_length`): + **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + max_new_tokens (`int`, *optional*, defaults to None): + The maximum numbers of tokens to generate, ignoring the current number of tokens. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. do_sample (`bool`, *optional*, defaults to `False`): @@ -1474,8 +1482,6 @@ def _generate( outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) ```""" # 1. 
Set generation parameters if not already defined - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping @@ -1514,12 +1520,6 @@ def _generate( logger.warning(f"Setting `pad_token_id` to {eos_token_id} (first `eos_token_id`) to generate sequence") pad_token_id = eos_token_id - if min_length is not None and min_length > max_length: - raise ValueError( - f"Unfeasable length constraints: the minimum length ({min_length}) is larger than the maximum " - f"length ({max_length})" - ) - use_xla = not tf.executing_eagerly() if use_xla and not self.supports_xla_generation: raise ValueError( @@ -1561,21 +1561,48 @@ def _generate( model_kwargs=model_kwargs, ) - if input_ids.shape[-1] >= max_length: + # 5. Prepare `max_length` depending on other stopping criteria + # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + input_ids_seq_length = input_ids.shape[-1] + if max_length is not None: + warnings.warn( + "The `max_length` argument is deprecated and will be removed in v5. Use `max_new_tokens` instead.", + FutureWarning, + ) + if max_length is None and max_new_tokens is not None: + max_length = max_new_tokens + input_ids_seq_length + elif max_length is not None and max_new_tokens is not None: + # Both are set, this is odd, raise a warning + warnings.warn( + "Both `max_length` and `max_new_tokens` have been set " + f"but they serve the same purpose. `max_length` {max_length} " + f"will take priority over `max_new_tokens` {max_new_tokens}.", + UserWarning, + ) + # default to config if still None + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + + if min_length is not None and min_length > max_length: raise ValueError( - f"The context has {input_ids.shape[-1]} number of tokens, " - f"but `max_length` is only {max_length}. " - "Please make sure that `max_length` is bigger than the number of tokens, " - "by setting either `generate(max_length=...,...)` or `config.max_length = ...`" + f"Unfeasable length constraints: the minimum length ({min_length}) is larger than the maximum " + f"length ({max_length})" + ) + if input_ids_seq_length >= max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" + f" {max_length}. This can lead to unexpected behavior. You should consider increasing" + "``max_new_tokens``." ) - # 5. determine generation mode + # 6. determine generation mode # TODO(Matt, Joao, Patrick) - add more use cases here is_greedy_gen_mode = (num_beams == 1) and do_sample is False is_sample_gen_mode = (num_beams == 1) and do_sample is True is_beam_gen_mode = (num_beams > 1) and do_sample is False - # 6. prepare distribution pre_processing samplers + # 7. prepare distribution pre_processing samplers logits_processor = self._get_logits_processor( repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, @@ -1587,13 +1614,13 @@ def _generate( forced_eos_token_id=forced_eos_token_id, ) - # 7. go into different generation modes + # 8. 
go into different generation modes if is_greedy_gen_mode: if num_return_sequences > 1: raise ValueError( f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." ) - # 8. run greedy search + # 9. run greedy search return self.greedy_search( input_ids, max_length=max_length, @@ -1605,10 +1632,10 @@ def _generate( **model_kwargs, ) elif is_sample_gen_mode: - # 8. prepare logits warper + # 9. prepare logits warper logits_warper = self._get_logits_warper(top_k=top_k, top_p=top_p, temperature=temperature) - # 9. expand input_ids with `num_return_sequences` additional sequences per batch + # 10. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids, expand_size=num_return_sequences, @@ -1616,7 +1643,7 @@ def _generate( **model_kwargs, ) - # 10. run sample + # 11. run sample return self.sample( input_ids, logits_processor=logits_processor, @@ -1637,7 +1664,7 @@ def _generate( f"num_beams >= num_return_sequences, got {num_beams} and {num_return_sequences} (respectivelly)" ) - # 8. broadcast inputs to the desired number of beams + # 9. broadcast inputs to the desired number of beams input_ids = self._expand_to_num_beams(input_ids, num_beams=num_beams) if "encoder_outputs" in model_kwargs: @@ -1650,7 +1677,7 @@ def _generate( model_kwargs["attention_mask"], num_beams=num_beams ) - # 9. run beam search + # 10. run beam search return self.beam_search( input_ids, max_length=max_length, diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 1792545e45476..bb2722464a875 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -70,8 +70,8 @@ class GreedySearchDecoderOnlyOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each - tensor of shape `(batch_size, config.vocab_size)`). + at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for + each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. @@ -100,8 +100,8 @@ class GreedySearchEncoderDecoderOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size, config.vocab_size)`). + at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for + each generated token), with each tensor of shape `(batch_size, config.vocab_size)`. 
encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. @@ -140,8 +140,8 @@ class SampleDecoderOnlyOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each - tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`). + at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for + each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, @@ -171,8 +171,8 @@ class SampleEncoderDecoderOutput(ModelOutput): if all batches finished early due to the `eos_token_id`. scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_return_sequences, config.vocab_size)`). + at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for + each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`. encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. @@ -214,8 +214,8 @@ class BeamSearchDecoderOnlyOutput(ModelOutput): scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`. beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, input_ids.shape[-1])`. 
@@ -251,8 +251,8 @@ class BeamSearchEncoderDecoderOutput(ModelOutput): scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). + Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams, config.vocab_size)`. beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, max_length-1)`. @@ -300,8 +300,8 @@ class BeamSampleDecoderOnlyOutput(ModelOutput): scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`. beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, input_ids.shape[-1])`. @@ -337,8 +337,8 @@ class BeamSampleEncoderDecoderOutput(ModelOutput): scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). + Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), + with each tensor of shape `(batch_size*num_beams, config.vocab_size)`). beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): Beam indices of generated token id at each generation step. `torch.LongTensor` of shape `(batch_size*num_return_sequences, max_length-1)`. @@ -923,10 +923,9 @@ def generate( should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. + **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. 
max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignore the current number of tokens. Use either - `max_new_tokens` or `max_length` but not both, they serve the same purpose. + The maximum numbers of tokens to generate, ignoring the current number of tokens. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. do_sample (`bool`, *optional*, defaults to `False`): @@ -1195,10 +1194,14 @@ def generate( # if decoder-only then inputs_tensor has to be `input_ids` input_ids = inputs_tensor - input_ids_seq_length = input_ids.shape[-1] - # 5. Prepare `max_length` depending on other stopping criteria # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + input_ids_seq_length = input_ids.shape[-1] + if max_length is not None: + warnings.warn( + "The `max_length` argument is deprecated and will be removed in v5. Use `max_new_tokens` instead.", + FutureWarning, + ) if max_length is None and max_new_tokens is not None: max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: @@ -1222,8 +1225,8 @@ def generate( input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" - f" {max_length}. This can lead to unexpected behavior. You should consider increasing" - " ``config.max_length`` or ``max_length``." + f" {max_length}. This can lead to unexpected behavior. You should consider increasing " + "``max_new_tokens``." ) # 6. determine generation mode From da10d52ff4e259d2aea58bea0a73c71cef63b2c2 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 11 Jul 2022 18:42:48 +0000 Subject: [PATCH 2/8] PR suggestions --- src/transformers/generation_flax_utils.py | 30 +++++++++++------------ src/transformers/generation_tf_utils.py | 29 +++++++++++----------- src/transformers/generation_utils.py | 26 ++++++++++---------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 0c1348d15293f..1237fadbd7853 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -212,7 +212,8 @@ def generate( input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. max_length (`int`, *optional*, defaults to `model.config.max_length`): - **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores + the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): The maximum numbers of tokens to generate, ignoring the current number of tokens. do_sample (`bool`, *optional*, defaults to `False`): @@ -280,23 +281,22 @@ def generate( # prepare decoder_input_ids for generation input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id - # Prepare `max_length` depending on other stopping criteria - # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + # Prepare `max_length` depending on other stopping criteria. 
input_ids_seq_length = input_ids.shape[-1] - if max_length is not None: + if max_length is None and max_new_tokens is None: warnings.warn( - "The `max_length` argument is deprecated and will be removed in v5. Use `max_new_tokens` instead.", - FutureWarning, + "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " + f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "of the generation.", + UserWarning, ) - if max_length is None and max_new_tokens is not None: + elif max_length is None and max_new_tokens is not None: max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: - # Both are set, this is odd, raise a warning - warnings.warn( - "Both `max_length` and `max_new_tokens` have been set " - f"but they serve the same purpose. `max_length` {max_length} " - f"will take priority over `max_new_tokens` {max_new_tokens}.", - UserWarning, + raise ValueError( + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " + "limit to the generated output length. Please refer to the documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length @@ -310,9 +310,9 @@ def generate( if input_ids_seq_length >= max_length: input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" f" {max_length}. This can lead to unexpected behavior. You should consider increasing" - "``max_new_tokens``." + "`max_new_tokens`." ) do_sample = do_sample if do_sample is not None else self.config.do_sample diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index edac7f63840fe..643e8479c23cf 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -426,7 +426,8 @@ def generate( should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. max_length (`int`, *optional*, defaults to `model.config.max_length`): - **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores + the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): The maximum numbers of tokens to generate, ignoring the current number of tokens. min_length (`int`, *optional*, defaults to 10): @@ -1339,7 +1340,8 @@ def _generate( The sequence used as a prompt for the generation. If `None` the method initializes it with `bos_token_id` and a batch size of 1. max_length (`int`, *optional*, defaults to `model.config.max_length`): - **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores + the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): The maximum numbers of tokens to generate, ignoring the current number of tokens. 
min_length (`int`, *optional*, defaults to 10): @@ -1561,23 +1563,22 @@ def _generate( model_kwargs=model_kwargs, ) - # 5. Prepare `max_length` depending on other stopping criteria - # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + # 5. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] - if max_length is not None: + if max_length is None and max_new_tokens is None: warnings.warn( - "The `max_length` argument is deprecated and will be removed in v5. Use `max_new_tokens` instead.", - FutureWarning, + "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " + f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "of the generation.", + UserWarning, ) - if max_length is None and max_new_tokens is not None: + elif max_length is None and max_new_tokens is not None: max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: - # Both are set, this is odd, raise a warning - warnings.warn( - "Both `max_length` and `max_new_tokens` have been set " - f"but they serve the same purpose. `max_length` {max_length} " - f"will take priority over `max_new_tokens` {max_new_tokens}.", - UserWarning, + raise ValueError( + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " + "limit to the generated output length. Please refer to the documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index bb2722464a875..04f15555141cf 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -923,7 +923,8 @@ def generate( should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. max_length (`int`, *optional*, defaults to `model.config.max_length`): - **DEPRECATED** The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`. + The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores + the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): The maximum numbers of tokens to generate, ignoring the current number of tokens. min_length (`int`, *optional*, defaults to 10): @@ -1194,23 +1195,22 @@ def generate( # if decoder-only then inputs_tensor has to be `input_ids` input_ids = inputs_tensor - # 5. Prepare `max_length` depending on other stopping criteria - # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` + # 5. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] - if max_length is not None: + if max_length is None and max_new_tokens is None: warnings.warn( - "The `max_length` argument is deprecated and will be removed in v5. Use `max_new_tokens` instead.", - FutureWarning, + "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " + f"{self.config.max_length} (`self.config.max_length`). 
This behavior is deprecated and will be " + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "of the generation.", + UserWarning, ) - if max_length is None and max_new_tokens is not None: + elif max_length is None and max_new_tokens is not None: max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: - # Both are set, this is odd, raise a warning - warnings.warn( - "Both `max_length` and `max_new_tokens` have been set " - f"but they serve the same purpose. `max_length` {max_length} " - f"will take priority over `max_new_tokens` {max_new_tokens}.", - UserWarning, + raise ValueError( + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " + "limit to the generated output length. Please refer to the documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length From 3feccd5365344f55b2c8b2fc39d8e4bcce8b5733 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 11 Jul 2022 18:45:32 +0000 Subject: [PATCH 3/8] improve max_new_tokens param message --- src/transformers/generation_flax_utils.py | 2 +- src/transformers/generation_tf_utils.py | 4 ++-- src/transformers/generation_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 1237fadbd7853..31b369561d5fc 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -215,7 +215,7 @@ def generate( The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignoring the current number of tokens. + The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. do_sample (`bool`, *optional*, defaults to `False`): Whether or not to use sampling ; use greedy decoding otherwise. temperature (`float`, *optional*, defaults to 1.0): diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 643e8479c23cf..bc07f7d866b39 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -429,7 +429,7 @@ def generate( The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignoring the current number of tokens. + The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. do_sample (`bool`, *optional*, defaults to `False`): @@ -1343,7 +1343,7 @@ def _generate( The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignoring the current number of tokens. + The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. 
do_sample (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 04f15555141cf..4f4bbedbb1f09 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -926,7 +926,7 @@ def generate( The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignoring the current number of tokens. + The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. do_sample (`bool`, *optional*, defaults to `False`): From c6f1e955afa7f148bacca835d74986b4c4c05158 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 11 Jul 2022 18:47:29 +0000 Subject: [PATCH 4/8] remove double backticks --- src/transformers/generation_tf_utils.py | 4 ++-- src/transformers/generation_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index bc07f7d866b39..dd01834cd4876 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1592,9 +1592,9 @@ def _generate( if input_ids_seq_length >= max_length: input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" f" {max_length}. This can lead to unexpected behavior. You should consider increasing" - "``max_new_tokens``." + "`max_new_tokens`." ) # 6. determine generation mode diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 4f4bbedbb1f09..a63157445660a 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -1224,9 +1224,9 @@ def generate( if input_ids_seq_length >= max_length: input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but ``max_length`` is set to" + f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" f" {max_length}. This can lead to unexpected behavior. You should consider increasing " - "``max_new_tokens``." + "`max_new_tokens`." ) # 6. determine generation mode From 987fb1820afae84f86521f084a73f55280d2dfff Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 12 Jul 2022 11:20:09 +0000 Subject: [PATCH 5/8] Apply PR suggestions --- src/transformers/generation_flax_utils.py | 7 ++++--- src/transformers/generation_tf_utils.py | 9 +++++---- src/transformers/generation_utils.py | 9 +++++---- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 31b369561d5fc..39cb87de56af6 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -214,7 +214,7 @@ def generate( max_length (`int`, *optional*, defaults to `model.config.max_length`): The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. 
- max_new_tokens (`int`, *optional*, defaults to None): + max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. do_sample (`bool`, *optional*, defaults to `False`): Whether or not to use sampling ; use greedy decoding otherwise. @@ -295,8 +295,9 @@ def generate( max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: raise ValueError( - "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " - "limit to the generated output length. Please refer to the documentation for more information." + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" + " limit to the generated output length. Remove one of those arguments. Please refer to the" + " documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index dd01834cd4876..d890d90c0a103 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -428,7 +428,7 @@ def generate( max_length (`int`, *optional*, defaults to `model.config.max_length`): The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. - max_new_tokens (`int`, *optional*, defaults to None): + max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. @@ -1342,7 +1342,7 @@ def _generate( max_length (`int`, *optional*, defaults to `model.config.max_length`): The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. - max_new_tokens (`int`, *optional*, defaults to None): + max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. @@ -1577,8 +1577,9 @@ def _generate( max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: raise ValueError( - "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " - "limit to the generated output length. Please refer to the documentation for more information." + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" + " limit to the generated output length. Remove one of those arguments. Please refer to the" + " documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index a63157445660a..4aa56c50f4946 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -925,7 +925,7 @@ def generate( max_length (`int`, *optional*, defaults to `model.config.max_length`): The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores the number of tokens in the prompt. 
- max_new_tokens (`int`, *optional*, defaults to None): + max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): The minimum length of the sequence to be generated. @@ -974,7 +974,7 @@ def generate( where one can allow different forms of each word. num_return_sequences(`int`, *optional*, defaults to 1): The number of independently computed returned sequences for each element in the batch. - max_time(`float`, *optional*, defaults to None): + max_time(`float`, *optional*): The maximum amount of time you allow the computation to run for in seconds. generation will still finish the current pass after allocated time has been passed. attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1209,8 +1209,9 @@ def generate( max_length = max_new_tokens + input_ids_seq_length elif max_length is not None and max_new_tokens is not None: raise ValueError( - "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a " - "limit to the generated output length. Please refer to the documentation for more information." + "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" + " limit to the generated output length. Remove one of those arguments. Please refer to the" + " documentation for more information." ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length From 573bc4bb233840bab3a2ee236e618ad464b21a86 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 12 Jul 2022 11:32:05 +0000 Subject: [PATCH 6/8] add missing spaces --- src/transformers/generation_flax_utils.py | 2 +- src/transformers/generation_tf_utils.py | 2 +- src/transformers/generation_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 39cb87de56af6..896024ba2513b 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -287,7 +287,7 @@ def generate( warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " "of the generation.", UserWarning, ) diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index d890d90c0a103..4c388c1eac418 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1569,7 +1569,7 @@ def _generate( warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " f"{self.config.max_length} (`self.config.max_length`). 
This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " "of the generation.", UserWarning, ) diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 4aa56c50f4946..cb18d8159c5b8 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -1201,7 +1201,7 @@ def generate( warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length" + "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " "of the generation.", UserWarning, ) From fa5fc8cc916d22912c05cb9d6497a03ceb865cea Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Sat, 23 Jul 2022 12:39:41 +0000 Subject: [PATCH 7/8] PR suggestions from Patrick (improve docstring, warning, and error message) --- src/transformers/generation_flax_utils.py | 14 ++++++++------ src/transformers/generation_tf_utils.py | 19 +++++++++++-------- src/transformers/generation_utils.py | 11 ++++++----- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index 896024ba2513b..2f80c7fcf27e9 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -212,8 +212,9 @@ def generate( input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores - the number of tokens in the prompt. + The maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. In general, prefer the use of `max_new_tokens`, which ignores the number of tokens in + the prompt. max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. do_sample (`bool`, *optional*, defaults to `False`): @@ -286,9 +287,9 @@ def generate( if max_length is None and max_new_tokens is None: warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " - f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " - "of the generation.", + f"{self.config.max_length} (`self.config.max_length`). Controlling `max_length` via the config is " + "deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend " + "using `max_new_tokens` to control the maximum length of the generation.", UserWarning, ) elif max_length is None and max_new_tokens is not None: @@ -297,7 +298,8 @@ def generate( raise ValueError( "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" " limit to the generated output length. Remove one of those arguments. Please refer to the" - " documentation for more information." + " documentation for more information. 
" + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index 4c388c1eac418..ec9a61e90099f 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -426,8 +426,9 @@ def generate( should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores - the number of tokens in the prompt. + The maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. In general, prefer the use of `max_new_tokens`, which ignores the number of tokens in + the prompt. max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): @@ -1340,8 +1341,9 @@ def _generate( The sequence used as a prompt for the generation. If `None` the method initializes it with `bos_token_id` and a batch size of 1. max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores - the number of tokens in the prompt. + The maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. In general, prefer the use of `max_new_tokens`, which ignores the number of tokens in + the prompt. max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): @@ -1568,9 +1570,9 @@ def _generate( if max_length is None and max_new_tokens is None: warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " - f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " - "of the generation.", + f"{self.config.max_length} (`self.config.max_length`). Controlling `max_length` via the config is " + "deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend " + "using `max_new_tokens` to control the maximum length of the generation.", UserWarning, ) elif max_length is None and max_new_tokens is not None: @@ -1579,7 +1581,8 @@ def _generate( raise ValueError( "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" " limit to the generated output length. Remove one of those arguments. Please refer to the" - " documentation for more information." + " documentation for more information. 
" + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index cb18d8159c5b8..87e5b500f282d 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -923,8 +923,9 @@ def generate( should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. Prefer the use of `max_new_tokens`, which ignores - the number of tokens in the prompt. + The maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. In general, prefer the use of `max_new_tokens`, which ignores the number of tokens in + the prompt. max_new_tokens (`int`, *optional*): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. min_length (`int`, *optional*, defaults to 10): @@ -1200,9 +1201,9 @@ def generate( if max_length is None and max_new_tokens is None: warnings.warn( "Neither `max_length` nor `max_new_tokens` have been set, `max_length` will default to " - f"{self.config.max_length} (`self.config.max_length`). This behavior is deprecated and will be " - "removed in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length " - "of the generation.", + f"{self.config.max_length} (`self.config.max_length`). Controlling `max_length` via the config is " + "deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend " + "using `max_new_tokens` to control the maximum length of the generation.", UserWarning, ) elif max_length is None and max_new_tokens is not None: From 74a5bf7c31482837cb2d6a4570287392c46bf59b Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Sat, 23 Jul 2022 16:37:26 +0000 Subject: [PATCH 8/8] update tests --- src/transformers/generation_utils.py | 3 ++- tests/generation/test_generation_utils.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py index 87e5b500f282d..4a2f9dfdfe17b 100644 --- a/src/transformers/generation_utils.py +++ b/src/transformers/generation_utils.py @@ -1212,7 +1212,8 @@ def generate( raise ValueError( "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a" " limit to the generated output length. Remove one of those arguments. Please refer to the" - " documentation for more information." + " documentation for more information. " + "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) # default to config if still None max_length = max_length if max_length is not None else self.config.max_length diff --git a/tests/generation/test_generation_utils.py b/tests/generation/test_generation_utils.py index ce12d631bf39c..56227403ae60b 100644 --- a/tests/generation/test_generation_utils.py +++ b/tests/generation/test_generation_utils.py @@ -2023,8 +2023,8 @@ def test_max_new_tokens_encoder_decoder(self): # 1 BOS + 20 + 3 new tokens self.assertEqual(list(outputs.shape), [1, 24]) - # max_new_tokens and max_length serve the same purpose and should not be used together. 
From 74a5bf7c31482837cb2d6a4570287392c46bf59b Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Sat, 23 Jul 2022 16:37:26 +0000
Subject: [PATCH 8/8] update tests

---
 src/transformers/generation_utils.py      | 3 ++-
 tests/generation/test_generation_utils.py | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 87e5b500f282d..4a2f9dfdfe17b 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1212,7 +1212,8 @@ def generate(
             raise ValueError(
                 "Both `max_new_tokens` and `max_length` have been set but they serve the same purpose -- setting a"
                 " limit to the generated output length. Remove one of those arguments. Please refer to the"
-                " documentation for more information."
+                " documentation for more information. "
+                "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
             )
         # default to config if still None
         max_length = max_length if max_length is not None else self.config.max_length
diff --git a/tests/generation/test_generation_utils.py b/tests/generation/test_generation_utils.py
index ce12d631bf39c..56227403ae60b 100644
--- a/tests/generation/test_generation_utils.py
+++ b/tests/generation/test_generation_utils.py
@@ -2023,8 +2023,8 @@ def test_max_new_tokens_encoder_decoder(self):
         # 1 BOS + 20 + 3 new tokens
         self.assertEqual(list(outputs.shape), [1, 24])

-        # max_new_tokens and max_length serve the same purpose and should not be used together.
-        with self.assertWarns(UserWarning):
+        # max_new_tokens and max_length serve the same purpose and must not be used together.
+        with self.assertRaises(ValueError):
             bart_model.generate(decoder_input_ids=input_ids, max_new_tokens=10, max_length=20)

     def test_max_new_tokens_decoder_only(self):
@@ -2050,8 +2050,8 @@ def test_max_new_tokens_decoder_only(self):
         # 1 BOS token + 23 new tokens
         self.assertEqual(list(outputs.shape), [1, 24])

-        # max_new_tokens and max_length serve the same purpose and should not be used together.
-        with self.assertWarns(UserWarning):
+        # max_new_tokens and max_length serve the same purpose and must not be used together.
+        with self.assertRaises(ValueError):
             gpt2_model.generate(decoder_input_ids=input_ids, max_new_tokens=10, max_length=20)

     def test_encoder_decoder_generate_with_inputs_embeds(self):
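
From the caller's side, the behaviour pinned down by the updated tests looks roughly like the sketch below. It is a hypothetical usage example: the `gpt2` checkpoint and the prompt are chosen here purely for illustration, but the way the two length arguments interact mirrors what the tests assert.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
input_ids = tokenizer("Hello world", return_tensors="pt").input_ids

# Preferred: bound only the number of newly generated tokens.
outputs = model.generate(input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

# After this patch series, passing both limits is rejected with a ValueError
# instead of only emitting a UserWarning.
try:
    model.generate(input_ids, max_new_tokens=10, max_length=20)
except ValueError as err:
    print(err)
```
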