Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Sep 29, 2022
1 parent e4472cf commit f45116e
Showing 1 changed file with 12 additions and 16 deletions.
28 changes: 12 additions & 16 deletions tests/test_load.py
Expand Up @@ -862,6 +862,18 @@ def assert_auth(url, *args, headers, **kwargs):
mock_head.assert_called()


@pytest.mark.integration
def test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
    """Stream the private text-data repo and check that iteration yields a first item.

    No token is passed to ``load_dataset`` explicitly; ``hf_token`` is requested
    as a fixture but never referenced in the body (presumably it supplies the
    credentials implicitly -- confirm against the fixture definition).
    """
    streamed = load_dataset(hf_private_dataset_repo_txt_data, streaming=True)
    first = next(iter(streamed))
    # NOTE(review): if the result is a mapping of splits, ``first`` is a split
    # name rather than an example -- confirm this is the intended check.
    assert first is not None


@pytest.mark.integration
def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
    """Stream the private zipped-text repo and check that iteration yields a first item.

    No token is passed to ``load_dataset`` explicitly; ``hf_token`` is requested
    as a fixture but never referenced in the body (presumably it supplies the
    credentials implicitly -- confirm against the fixture definition).
    """
    streamed = load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True)
    first = next(iter(streamed))
    # NOTE(review): if the result is a mapping of splits, ``first`` is a split
    # name rather than an example -- confirm this is the intended check.
    assert first is not None


@require_pil
@pytest.mark.integration
@pytest.mark.parametrize("implicit_token", [False, True])
Expand All @@ -878,22 +890,6 @@ def test_load_dataset_private_zipped_images(
assert len(ds_items) == 2


@pytest.mark.integration
def test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
    """Check that streaming the private text-data repo fails without a token and works with one."""
    repo_id = hf_private_dataset_repo_txt_data
    # Loading without credentials is expected to raise FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        load_dataset(repo_id, streaming=True)
    # With the token passed explicitly, iteration must yield a first item.
    streamed = load_dataset(repo_id, streaming=True, use_auth_token=hf_token)
    first = next(iter(streamed))
    assert first is not None


@pytest.mark.integration
def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
    """Check that streaming the private zipped-text repo fails without a token and works with one."""
    repo_id = hf_private_dataset_repo_zipped_txt_data
    # Loading without credentials is expected to raise FileNotFoundError.
    with pytest.raises(FileNotFoundError):
        load_dataset(repo_id, streaming=True)
    # With the token passed explicitly, iteration must yield a first item.
    streamed = load_dataset(repo_id, streaming=True, use_auth_token=hf_token)
    first = next(iter(streamed))
    assert first is not None


def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir, tmp_path, caplog):
cache_dir1 = tmp_path / "cache1"
cache_dir2 = tmp_path / "cache2"
Expand Down

1 comment on commit f45116e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.009792 / 0.011353 (-0.001561) 0.007299 / 0.011008 (-0.003710) 0.037125 / 0.038508 (-0.001383) 0.044767 / 0.023109 (0.021657) 0.370953 / 0.275898 (0.095055) 0.458146 / 0.323480 (0.134666) 0.007482 / 0.007986 (-0.000503) 0.004130 / 0.004328 (-0.000198) 0.008518 / 0.004250 (0.004268) 0.057565 / 0.037052 (0.020513) 0.375453 / 0.258489 (0.116964) 0.436416 / 0.293841 (0.142575) 0.037834 / 0.128546 (-0.090712) 0.011633 / 0.075646 (-0.064014) 0.322201 / 0.419271 (-0.097071) 0.062352 / 0.043533 (0.018820) 0.380534 / 0.255139 (0.125395) 0.395830 / 0.283200 (0.112630) 0.129548 / 0.141683 (-0.012134) 1.843556 / 1.452155 (0.391401) 1.842049 / 1.492716 (0.349333)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.229763 / 0.018006 (0.211757) 0.512249 / 0.000490 (0.511759) 0.001339 / 0.000200 (0.001140) 0.000168 / 0.000054 (0.000114)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.028605 / 0.037411 (-0.008806) 0.127179 / 0.014526 (0.112653) 0.136145 / 0.176557 (-0.040412) 0.192285 / 0.737135 (-0.544851) 0.141763 / 0.296338 (-0.154575)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.489595 / 0.215209 (0.274386) 4.858732 / 2.077655 (2.781078) 2.220593 / 1.504120 (0.716473) 2.021256 / 1.541195 (0.480061) 2.061907 / 1.468490 (0.593417) 0.524550 / 4.584777 (-4.060227) 4.444957 / 3.745712 (0.699245) 2.294822 / 5.269862 (-2.975040) 1.710816 / 4.565676 (-2.854860) 0.057282 / 0.424275 (-0.366993) 0.012786 / 0.007607 (0.005179) 0.568286 / 0.226044 (0.342242) 5.598104 / 2.268929 (3.329175) 2.574086 / 55.444624 (-52.870539) 2.241160 / 6.876477 (-4.635317) 2.319821 / 2.142072 (0.177749) 0.613677 / 4.805227 (-4.191550) 0.138753 / 6.500664 (-6.361911) 0.072072 / 0.075469 (-0.003397)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.828900 / 1.841788 (-0.012888) 16.759324 / 8.074308 (8.685016) 29.435661 / 10.191392 (19.244269) 1.000469 / 0.680424 (0.320045) 0.668215 / 0.534201 (0.134014) 0.472477 / 0.579283 (-0.106806) 0.544875 / 0.434364 (0.110511) 0.335145 / 0.540337 (-0.205193) 0.342891 / 1.386936 (-1.044045)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007477 / 0.011353 (-0.003876) 0.004825 / 0.011008 (-0.006183) 0.034166 / 0.038508 (-0.004343) 0.041258 / 0.023109 (0.018149) 0.434562 / 0.275898 (0.158664) 0.531486 / 0.323480 (0.208006) 0.004632 / 0.007986 (-0.003353) 0.004087 / 0.004328 (-0.000242) 0.006129 / 0.004250 (0.001878) 0.050541 / 0.037052 (0.013489) 0.440374 / 0.258489 (0.181885) 0.510802 / 0.293841 (0.216961) 0.036644 / 0.128546 (-0.091902) 0.013852 / 0.075646 (-0.061795) 0.316498 / 0.419271 (-0.102774) 0.065570 / 0.043533 (0.022037) 0.450680 / 0.255139 (0.195541) 0.470889 / 0.283200 (0.187689) 0.123567 / 0.141683 (-0.018116) 1.796888 / 1.452155 (0.344733) 1.819800 / 1.492716 (0.327083)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.311530 / 0.018006 (0.293523) 0.492039 / 0.000490 (0.491550) 0.026890 / 0.000200 (0.026690) 0.000408 / 0.000054 (0.000353)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.028566 / 0.037411 (-0.008845) 0.130815 / 0.014526 (0.116289) 0.138635 / 0.176557 (-0.037921) 0.195120 / 0.737135 (-0.542015) 0.143014 / 0.296338 (-0.153324)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.507546 / 0.215209 (0.292337) 5.054546 / 2.077655 (2.976892) 2.388905 / 1.504120 (0.884785) 2.177009 / 1.541195 (0.635814) 2.160555 / 1.468490 (0.692064) 0.499621 / 4.584777 (-4.085156) 4.587663 / 3.745712 (0.841950) 2.292722 / 5.269862 (-2.977139) 1.489402 / 4.565676 (-3.076274) 0.057164 / 0.424275 (-0.367111) 0.012537 / 0.007607 (0.004930) 0.597963 / 0.226044 (0.371919) 6.044235 / 2.268929 (3.775306) 2.918435 / 55.444624 (-52.526189) 2.601173 / 6.876477 (-4.275304) 2.807580 / 2.142072 (0.665508) 0.639089 / 4.805227 (-4.166138) 0.143865 / 6.500664 (-6.356800) 0.072091 / 0.075469 (-0.003378)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.752101 / 1.841788 (-0.089687) 16.145844 / 8.074308 (8.071536) 29.877286 / 10.191392 (19.685894) 1.099370 / 0.680424 (0.418946) 0.752957 / 0.534201 (0.218756) 0.469082 / 0.579283 (-0.110201) 0.515916 / 0.434364 (0.081552) 0.320956 / 0.540337 (-0.219381) 0.340351 / 1.386936 (-1.046585)

CML watermark

Please sign in to comment.