diff --git a/tests/test_dataset_cards.py b/tests/test_dataset_cards.py index cd56c9f32c9..44c19c26810 100644 --- a/tests/test_dataset_cards.py +++ b/tests/test_dataset_cards.py @@ -20,7 +20,7 @@ from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES from datasets.utils.logging import get_logger -from datasets.utils.metadata import DatasetMetadata, validate_metadata_type, yaml_block_from_readme +from datasets.utils.metadata import DatasetMetadata from datasets.utils.readme import ReadMe from .utils import slow @@ -46,7 +46,11 @@ def get_changed_datasets(repo_path: Path) -> List[Path]: def get_all_datasets(repo_path: Path) -> List[Path]: - dataset_names = [path.parts[-1] for path in (repo_path / "datasets").iterdir() if path.is_dir()] + dataset_names = [ + path.parts[-1] + for path in (repo_path / "datasets").iterdir() + if path.is_dir() and (path / path.name).with_suffix(".py").is_file() + ] return [dataset_name for dataset_name in dataset_names if dataset_name not in _PACKAGED_DATASETS_MODULES] @@ -64,14 +68,13 @@ def test_changed_dataset_card(dataset_name): ) try: readme = ReadMe.from_readme(card_path, suppress_parsing_errors=True) - readme.validate() except Exception as readme_validation_error: error_messages.append( f"The following issues have been found in the dataset cards:\nREADME Validation:\n{readme_validation_error}" ) try: metadata = DatasetMetadata.from_readme(card_path) - metadata.validate() + assert metadata, "empty metadata" except Exception as metadata_error: error_messages.append( f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}" @@ -89,10 +92,8 @@ def test_dataset_card_yaml_structure(dataset_name): """ card_path = repo_path / "datasets" / dataset_name / "README.md" assert card_path.exists() - yaml_string = yaml_block_from_readme(card_path) - metadata_dict = DatasetMetadata._metadata_dict_from_yaml_string(yaml_string) + metadata_dict = DatasetMetadata.from_readme(card_path) assert len(metadata_dict) > 0 - validate_metadata_type(metadata_dict) @slow @@ -117,7 +118,7 @@ def test_dataset_card(dataset_name): ) try: metadata = DatasetMetadata.from_readme(card_path) - metadata.validate() + assert metadata except Exception as metadata_error: error_messages.append( f"The following issues have been found in the dataset cards:\nYAML tags:\n{metadata_error}" diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index 1a96c440182..7222a963ce3 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -1,16 +1,9 @@ import re import tempfile import unittest -from dataclasses import _MISSING_TYPE, asdict, fields from pathlib import Path -from datasets.utils.metadata import ( - DatasetMetadata, - metadata_dict_from_readme, - tagset_validator, - validate_metadata_type, - yaml_block_from_readme, -) +from datasets.utils.metadata import DatasetMetadata def _dedent(string: str) -> str: @@ -48,157 +41,26 @@ def _dedent(string: str) -> str: class TestMetadataUtils(unittest.TestCase): - def test_validate_metadata_type(self): - metadata_dict = { - "tag": ["list", "of", "values"], - "another tag": ["Another", {"list"}, ["of"], 0x646D46736457567A], - } - with self.assertRaises(TypeError): - validate_metadata_type(metadata_dict) - - metadata_dict = {"tag1": []} - with self.assertRaises(TypeError): - validate_metadata_type(metadata_dict) - - metadata_dict = {"tag1": None} - with self.assertRaises(TypeError): - validate_metadata_type(metadata_dict) - - def test_tagset_validator(self): - name = "test_tag" - url = "https://dummy.hf.co" - - items = ["tag1", "tag2", "tag2", "tag3"] - reference_values = ["tag1", "tag2", "tag3"] - returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, items) - self.assertIsNone(error) - - items = [] - reference_values = ["tag1", "tag2", "tag3"] - items, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) - self.assertListEqual(items, []) - self.assertIsNone(error) - - items = [] - reference_values = [] - returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, []) - self.assertIsNone(error) - - items = ["tag1", "tag2", "tag2", "tag3", "unknown tag"] - reference_values = ["tag1", "tag2", "tag3"] - returned_values, error = tagset_validator(items=items, reference_values=reference_values, name=name, url=url) - self.assertListEqual(returned_values, []) - self.assertEqual(error, f"{['unknown tag']} are not registered tags for '{name}', reference at {url}") - - def predicate_fn(string): - return "ignore" in string - - items = ["process me", "process me too", "ignore me"] - reference_values = ["process me too"] - returned_values, error = tagset_validator( - items=items, - reference_values=reference_values, - name=name, - url=url, - escape_validation_predicate_fn=predicate_fn, - ) - self.assertListEqual(returned_values, []) - self.assertEqual(error, f"{['process me']} are not registered tags for '{name}', reference at {url}") - - items = ["process me", "process me too", "ignore me"] - reference_values = ["process me too", "process me"] - returned_values, error = tagset_validator( - items=items, - reference_values=reference_values, - name=name, - url=url, - escape_validation_predicate_fn=predicate_fn, - ) - self.assertListEqual(returned_values, items) - self.assertIsNone(error) - - items = ["ignore me too", "ignore me"] - reference_values = ["process me too"] - returned_values, error = tagset_validator( - items=items, - reference_values=reference_values, - name=name, - url=url, - escape_validation_predicate_fn=predicate_fn, - ) - self.assertListEqual(returned_values, items) - self.assertIsNone(error) - - def test_yaml_block_from_readme(self): - with tempfile.TemporaryDirectory() as tmp_dir: - path = Path(tmp_dir) / "README.md" - - with open(path, "w+") as readme_file: - readme_file.write(README_YAML) - yaml_block = yaml_block_from_readme(path=path) - self.assertEqual( - yaml_block, - _dedent( - """\ - language: - - zh - - en - task_ids: - - sentiment-classification - """ - ), - ) - - with open(path, "w+") as readme_file: - readme_file.write(README_EMPTY_YAML) - yaml_block = yaml_block_from_readme(path=path) - self.assertEqual( - yaml_block, - _dedent( - """\ - """ - ), - ) - - with open(path, "w+") as readme_file: - readme_file.write(README_NO_YAML) - yaml_block = yaml_block_from_readme(path=path) - self.assertIsNone(yaml_block) - def test_metadata_dict_from_readme(self): with tempfile.TemporaryDirectory() as tmp_dir: path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(README_YAML) - metadata_dict = metadata_dict_from_readme(path) + metadata_dict = DatasetMetadata.from_readme(path) self.assertDictEqual(metadata_dict, {"language": ["zh", "en"], "task_ids": ["sentiment-classification"]}) with open(path, "w+") as readme_file: readme_file.write(README_EMPTY_YAML) - metadata_dict = metadata_dict_from_readme(path) + metadata_dict = DatasetMetadata.from_readme(path) self.assertDictEqual(metadata_dict, {}) with open(path, "w+") as readme_file: readme_file.write(README_NO_YAML) - metadata_dict = metadata_dict_from_readme(path) - self.assertIsNone(metadata_dict) + metadata_dict = DatasetMetadata.from_readme(path) + self.assertEqual(metadata_dict, {}) def test_from_yaml_string(self): - default_optional_keys = { - field.name: field.default - for field in fields(DatasetMetadata) - if type(field.default) is not _MISSING_TYPE and field.name not in DatasetMetadata._DEPRECATED_YAML_KEYS - } - - default_deprecated_keys = { - field.name: field.default - for field in fields(DatasetMetadata) - if field.name in DatasetMetadata._DEPRECATED_YAML_KEYS - } - valid_yaml_string = _dedent( """\ annotations_creators: @@ -222,87 +84,7 @@ def test_from_yaml_string(self): - open-domain-qa """ ) - DatasetMetadata.from_yaml_string(valid_yaml_string) - - valid_yaml_string_with_configs = _dedent( - """\ - annotations_creators: - - found - language_creators: - - found - language: - en: - - en - fr: - - fr - license: - - unknown - multilinguality: - - monolingual - pretty_name: Test Dataset - size_categories: - - 10K