Skip to content

Commit

Permalink
remove validation tests
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Sep 2, 2022
1 parent 216bb7e commit 887d514
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 395 deletions.
17 changes: 9 additions & 8 deletions tests/test_dataset_cards.py
Expand Up @@ -20,7 +20,7 @@

from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES
from datasets.utils.logging import get_logger
from datasets.utils.metadata import DatasetMetadata, validate_metadata_type, yaml_block_from_readme
from datasets.utils.metadata import DatasetMetadata
from datasets.utils.readme import ReadMe

from .utils import slow
Expand All @@ -46,7 +46,11 @@ def get_changed_datasets(repo_path: Path) -> List[Path]:


def get_all_datasets(repo_path: Path) -> List[Path]:
    """Return the names of all local datasets under ``repo_path / "datasets"``.

    A directory ``datasets/<name>`` is considered a dataset only if it contains
    a loading script ``<name>.py``. Names that collide with the packaged
    (built-in) dataset modules are excluded.

    Args:
        repo_path: Root of the repository checkout.

    Returns:
        Dataset names. NOTE(review): these are the directory-name strings
        (``path.parts[-1]``), not ``Path`` objects — the ``List[Path]``
        annotation is inaccurate but kept unchanged for interface
        compatibility with callers.
    """
    # The original block contained a dead first assignment (the pre-diff,
    # single-line version of this comprehension) that was immediately
    # overwritten by the filtered version below; it has been removed.
    dataset_names = [
        path.parts[-1]
        for path in (repo_path / "datasets").iterdir()
        # Require the <name>/<name>.py loading script to exist.
        if path.is_dir() and (path / path.name).with_suffix(".py").is_file()
    ]
    return [dataset_name for dataset_name in dataset_names if dataset_name not in _PACKAGED_DATASETS_MODULES]


Expand All @@ -64,14 +68,13 @@ def test_changed_dataset_card(dataset_name):
)
try:
readme = ReadMe.from_readme(card_path, suppress_parsing_errors=True)
readme.validate()
except Exception as readme_validation_error:
error_messages.append(
f"The following issues have been found in the dataset cards:\nREADME Validation:\n{readme_validation_error}"
)
try:
metadata = DatasetMetadata.from_readme(card_path)
metadata.validate()
assert metadata, "empty metadata"
except Exception as metadata_error:
error_messages.append(
f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}"
Expand All @@ -89,10 +92,8 @@ def test_dataset_card_yaml_structure(dataset_name):
"""
card_path = repo_path / "datasets" / dataset_name / "README.md"
assert card_path.exists()
yaml_string = yaml_block_from_readme(card_path)
metadata_dict = DatasetMetadata._metadata_dict_from_yaml_string(yaml_string)
metadata_dict = DatasetMetadata.from_readme(card_path)
assert len(metadata_dict) > 0
validate_metadata_type(metadata_dict)


@slow
Expand All @@ -117,7 +118,7 @@ def test_dataset_card(dataset_name):
)
try:
metadata = DatasetMetadata.from_readme(card_path)
metadata.validate()
assert metadata
except Exception as metadata_error:
error_messages.append(
f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}"
Expand Down

1 comment on commit 887d514

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007635 / 0.011353 (-0.003717) 0.003990 / 0.011008 (-0.007018) 0.029699 / 0.038508 (-0.008809) 0.034837 / 0.023109 (0.011728) 0.299832 / 0.275898 (0.023934) 0.367932 / 0.323480 (0.044452) 0.006102 / 0.007986 (-0.001884) 0.003581 / 0.004328 (-0.000747) 0.007035 / 0.004250 (0.002785) 0.049395 / 0.037052 (0.012343) 0.306010 / 0.258489 (0.047521) 0.348126 / 0.293841 (0.054285) 0.030643 / 0.128546 (-0.097904) 0.009664 / 0.075646 (-0.065982) 0.257989 / 0.419271 (-0.161282) 0.053627 / 0.043533 (0.010094) 0.305716 / 0.255139 (0.050577) 0.317222 / 0.283200 (0.034022) 0.115314 / 0.141683 (-0.026368) 1.465856 / 1.452155 (0.013701) 1.516551 / 1.492716 (0.023835)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.287788 / 0.018006 (0.269782) 0.525529 / 0.000490 (0.525040) 0.007976 / 0.000200 (0.007776) 0.000236 / 0.000054 (0.000182)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.025385 / 0.037411 (-0.012026) 0.103291 / 0.014526 (0.088765) 0.117648 / 0.176557 (-0.058908) 0.166297 / 0.737135 (-0.570838) 0.121499 / 0.296338 (-0.174839)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.403040 / 0.215209 (0.187831) 4.013532 / 2.077655 (1.935877) 1.817032 / 1.504120 (0.312912) 1.623714 / 1.541195 (0.082519) 1.677041 / 1.468490 (0.208551) 0.422986 / 4.584777 (-4.161791) 3.750022 / 3.745712 (0.004310) 3.238049 / 5.269862 (-2.031813) 1.666882 / 4.565676 (-2.898794) 0.051552 / 0.424275 (-0.372723) 0.011132 / 0.007607 (0.003525) 0.509701 / 0.226044 (0.283657) 5.098254 / 2.268929 (2.829325) 2.280595 / 55.444624 (-53.164030) 1.917575 / 6.876477 (-4.958901) 2.059681 / 2.142072 (-0.082392) 0.544570 / 4.805227 (-4.260657) 0.119662 / 6.500664 (-6.381002) 0.060509 / 0.075469 (-0.014960)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.492925 / 1.841788 (-0.348862) 13.657196 / 8.074308 (5.582888) 25.298617 / 10.191392 (15.107225) 0.896221 / 0.680424 (0.215797) 0.564179 / 0.534201 (0.029978) 0.387116 / 0.579283 (-0.192167) 0.436823 / 0.434364 (0.002459) 0.275055 / 0.540337 (-0.265282) 0.273220 / 1.386936 (-1.113716)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.005885 / 0.011353 (-0.005468) 0.003943 / 0.011008 (-0.007065) 0.027930 / 0.038508 (-0.010578) 0.033258 / 0.023109 (0.010149) 0.379006 / 0.275898 (0.103108) 0.438967 / 0.323480 (0.115487) 0.004123 / 0.007986 (-0.003862) 0.003527 / 0.004328 (-0.000802) 0.005046 / 0.004250 (0.000795) 0.043709 / 0.037052 (0.006656) 0.389395 / 0.258489 (0.130906) 0.430296 / 0.293841 (0.136455) 0.030415 / 0.128546 (-0.098131) 0.009717 / 0.075646 (-0.065929) 0.257513 / 0.419271 (-0.161759) 0.055089 / 0.043533 (0.011556) 0.384835 / 0.255139 (0.129696) 0.402592 / 0.283200 (0.119392) 0.102246 / 0.141683 (-0.039437) 1.478171 / 1.452155 (0.026017) 1.509022 / 1.492716 (0.016306)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.294472 / 0.018006 (0.276466) 0.516375 / 0.000490 (0.515885) 0.001262 / 0.000200 (0.001062) 0.000082 / 0.000054 (0.000028)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.025216 / 0.037411 (-0.012196) 0.103377 / 0.014526 (0.088851) 0.115171 / 0.176557 (-0.061385) 0.158521 / 0.737135 (-0.578615) 0.121010 / 0.296338 (-0.175329)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.434571 / 0.215209 (0.219362) 4.332315 / 2.077655 (2.254660) 2.190297 / 1.504120 (0.686177) 1.997940 / 1.541195 (0.456745) 2.051332 / 1.468490 (0.582842) 0.424757 / 4.584777 (-4.160019) 3.820077 / 3.745712 (0.074365) 1.996322 / 5.269862 (-3.273540) 1.232056 / 4.565676 (-3.333621) 0.051568 / 0.424275 (-0.372707) 0.011331 / 0.007607 (0.003724) 0.537635 / 0.226044 (0.311591) 5.387688 / 2.268929 (3.118759) 2.611825 / 55.444624 (-52.832799) 2.283496 / 6.876477 (-4.592981) 2.405543 / 2.142072 (0.263471) 0.536599 / 4.805227 (-4.268628) 0.121029 / 6.500664 (-6.379635) 0.061755 / 0.075469 (-0.013714)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.551384 / 1.841788 (-0.290404) 13.861990 / 8.074308 (5.787682) 24.747299 / 10.191392 (14.555907) 0.920347 / 0.680424 (0.239923) 0.631381 / 0.534201 (0.097180) 0.387582 / 0.579283 (-0.191701) 0.435780 / 0.434364 (0.001416) 0.266558 / 0.540337 (-0.273779) 0.274529 / 1.386936 (-1.112407)

CML watermark

Please sign in to comment.