From 002387eef1f2eb1ba0764f545f36aa9f87b3e69e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 28 Jul 2022 18:53:11 +0100 Subject: [PATCH 1/5] More rigorous shape inference in to_tf_dataset --- src/datasets/arrow_dataset.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index d92735ab9a5..3083ea03927 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -420,6 +420,26 @@ def to_tf_dataset( batch_size=batch_size if drop_remainder else None, ) + shape_verification_signature, _ = dataset._get_output_signature( + dataset, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + cols_to_retain=cols_to_retain, + batch_size=2, + num_test_batches=200, + ) + + for column, tensor_spec in shape_verification_signature.items(): + shape = tensor_spec.shape.as_list() + existing_shape = output_signature[column].shape.as_list() + for i in range(len(shape)): + # Look for any unexpected None dimensions in the new shape - they indicate sneakily variable dims. + if existing_shape[i] is not None and shape[i] is None: + existing_shape[i] = None + if existing_shape != output_signature[column].shape.as_list(): + new_spec = tf.TensorSpec(shape=existing_shape, dtype=tensor_spec.dtype) + output_signature[column] = new_spec + def np_get_batch(indices): # Following the logic in `transformers.Trainer`, we do not drop `label_ids` or `label` even if they # are not in the list of requested columns, because the collator may rename them From a4ef784cf2ff45dd709fbcb5612de524954d785d Mon Sep 17 00:00:00 2001 From: matt Date: Fri, 29 Jul 2022 14:01:43 +0100 Subject: [PATCH 2/5] Simplify the new shape inference --- src/datasets/arrow_dataset.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 3083ea03927..8d0496dc418 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -224,7 +224,7 @@ def _get_output_signature( collate_fn_args: dict, cols_to_retain: Optional[List[str]] = None, batch_size: Optional[int] = None, - num_test_batches: int = 10, + num_test_batches: int = 200, ): """Private method used by `to_tf_dataset()` to find the shapes and dtypes of samples from this dataset after being passed through the collate_fn. Tensorflow needs an exact signature for tf.numpy_function, so @@ -253,11 +253,9 @@ def _get_output_signature( if len(dataset) == 0: raise ValueError("Unable to get the output signature because the dataset is empty.") - if batch_size is None: - test_batch_size = min(len(dataset), 8) - else: + if batch_size is not None: batch_size = min(len(dataset), batch_size) - test_batch_size = batch_size + test_batch_size = min(len(dataset), 2) test_batches = [] for _ in range(num_test_batches): @@ -420,26 +418,6 @@ def to_tf_dataset( batch_size=batch_size if drop_remainder else None, ) - shape_verification_signature, _ = dataset._get_output_signature( - dataset, - collate_fn=collate_fn, - collate_fn_args=collate_fn_args, - cols_to_retain=cols_to_retain, - batch_size=2, - num_test_batches=200, - ) - - for column, tensor_spec in shape_verification_signature.items(): - shape = tensor_spec.shape.as_list() - existing_shape = output_signature[column].shape.as_list() - for i in range(len(shape)): - # Look for any unexpected None dimensions in the new shape - they indicate sneakily variable dims. - if existing_shape[i] is not None and shape[i] is None: - existing_shape[i] = None - if existing_shape != output_signature[column].shape.as_list(): - new_spec = tf.TensorSpec(shape=existing_shape, dtype=tensor_spec.dtype) - output_signature[column] = new_spec - def np_get_batch(indices): # Following the logic in `transformers.Trainer`, we do not drop `label_ids` or `label` even if they # are not in the list of requested columns, because the collator may rename them From 144304b89a125afcea30cd0d24fff3ae48293a86 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 7 Sep 2022 18:15:42 +0100 Subject: [PATCH 3/5] Read length from Sequence features instead of just sampling batches --- src/datasets/arrow_dataset.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 8d0496dc418..cab1e898f2c 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -299,17 +299,20 @@ def _get_output_signature( f"Unrecognized array dtype {np_arrays[0].dtype}. \n" "Nested types and image/audio types are not supported yet." ) - shapes = [array.shape for array in np_arrays] - static_shape = [] - for dim in range(len(shapes[0])): - sizes = set([shape[dim] for shape in shapes]) - if dim == 0: - static_shape.append(batch_size) - continue - if len(sizes) == 1: # This dimension looks constant - static_shape.append(sizes.pop()) - else: # Use None for variable dimensions - static_shape.append(None) + if column in dataset and isinstance(dataset.features[column], Sequence) and dataset.features[column].length != -1: + static_shape = [batch_size, dataset.features[column].length] + else: + shapes = [array.shape for array in np_arrays] + static_shape = [] + for dim in range(len(shapes[0])): + sizes = set([shape[dim] for shape in shapes]) + if dim == 0: + static_shape.append(batch_size) + continue + if len(sizes) == 1: # This dimension looks constant + static_shape.append(sizes.pop()) + else: # Use None for variable dimensions + static_shape.append(None) tf_columns_to_signatures[column] = tf.TensorSpec(shape=static_shape, dtype=tf_dtype) np_columns_to_dtypes[column] = np_dtype From 31a6d58fc7f23d9cc3b8de5786b6c85c055832b6 Mon Sep 17 00:00:00 2001 From: matt Date: Wed, 7 Sep 2022 18:16:03 +0100 Subject: [PATCH 4/5] make style --- src/datasets/arrow_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index cab1e898f2c..58a4e96796c 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -299,7 +299,11 @@ def _get_output_signature( f"Unrecognized array dtype {np_arrays[0].dtype}. \n" "Nested types and image/audio types are not supported yet." ) - if column in dataset and isinstance(dataset.features[column], Sequence) and dataset.features[column].length != -1: + if ( + column in dataset + and isinstance(dataset.features[column], Sequence) + and dataset.features[column].length != -1 + ): static_shape = [batch_size, dataset.features[column].length] else: shapes = [array.shape for array in np_arrays] From c1b98ee55064d534899e143d0e3cf92de3c0228e Mon Sep 17 00:00:00 2001 From: matt Date: Thu, 8 Sep 2022 17:41:14 +0100 Subject: [PATCH 5/5] Remove Sequence-specific code --- src/datasets/arrow_dataset.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 58a4e96796c..8d0496dc418 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -299,24 +299,17 @@ def _get_output_signature( f"Unrecognized array dtype {np_arrays[0].dtype}. \n" "Nested types and image/audio types are not supported yet." ) - if ( - column in dataset - and isinstance(dataset.features[column], Sequence) - and dataset.features[column].length != -1 - ): - static_shape = [batch_size, dataset.features[column].length] - else: - shapes = [array.shape for array in np_arrays] - static_shape = [] - for dim in range(len(shapes[0])): - sizes = set([shape[dim] for shape in shapes]) - if dim == 0: - static_shape.append(batch_size) - continue - if len(sizes) == 1: # This dimension looks constant - static_shape.append(sizes.pop()) - else: # Use None for variable dimensions - static_shape.append(None) + shapes = [array.shape for array in np_arrays] + static_shape = [] + for dim in range(len(shapes[0])): + sizes = set([shape[dim] for shape in shapes]) + if dim == 0: + static_shape.append(batch_size) + continue + if len(sizes) == 1: # This dimension looks constant + static_shape.append(sizes.pop()) + else: # Use None for variable dimensions + static_shape.append(None) tf_columns_to_signatures[column] = tf.TensorSpec(shape=static_shape, dtype=tf_dtype) np_columns_to_dtypes[column] = np_dtype