Skip to content

Commit

Permalink
Test
Browse files Browse the repository at this point in the history
  • Loading branch information
mariosasko committed Oct 7, 2022
1 parent c7a1961 commit 353bec3
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions tests/test_arrow_dataset.py
Expand Up @@ -3356,6 +3356,25 @@ def _check_sql_dataset(dataset, expected_features):
assert dataset.features[feature].dtype == expected_dtype


@require_sqlalchemy
@pytest.mark.parametrize("con_type", ["string", "engine"])
def test_dataset_from_sql_con_type(con_type, sqlite_path, tmp_path):
    """Exercise ``Dataset.from_sql`` with each parametrized kind of ``con``.

    ``con_type`` selects whether ``con`` is the plain ``sqlite:///`` URI string
    or a SQLAlchemy engine created from that same URI. The resulting dataset
    is handed to ``_check_sql_dataset`` together with the expected column
    dtypes.
    """
    sqlite_uri = "sqlite:///" + sqlite_path
    if con_type == "engine":
        # Local import, used only by this branch.
        import sqlalchemy

        con = sqlalchemy.create_engine(sqlite_uri)
    elif con_type == "string":
        con = sqlite_uri
    dataset = Dataset.from_sql("dataset", con, cache_dir=tmp_path / "cache")
    _check_sql_dataset(
        dataset,
        {"col_1": "string", "col_2": "int64", "col_3": "float64"},
    )


@require_sqlalchemy
@pytest.mark.parametrize(
"features",
Expand Down

1 comment on commit 353bec3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.006948 / 0.011353 (-0.004405) 0.003609 / 0.011008 (-0.007399) 0.026278 / 0.038508 (-0.012230) 0.030951 / 0.023109 (0.007842) 0.257892 / 0.275898 (-0.018006) 0.318659 / 0.323480 (-0.004821) 0.005269 / 0.007986 (-0.002717) 0.003195 / 0.004328 (-0.001133) 0.006174 / 0.004250 (0.001923) 0.040734 / 0.037052 (0.003681) 0.271828 / 0.258489 (0.013339) 0.300336 / 0.293841 (0.006495) 0.027692 / 0.128546 (-0.100854) 0.008335 / 0.075646 (-0.067311) 0.228896 / 0.419271 (-0.190375) 0.045683 / 0.043533 (0.002150) 0.258452 / 0.255139 (0.003313) 0.282906 / 0.283200 (-0.000294) 0.092489 / 0.141683 (-0.049194) 1.249725 / 1.452155 (-0.202429) 1.283872 / 1.492716 (-0.208844)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.199263 / 0.018006 (0.181257) 0.433305 / 0.000490 (0.432816) 0.004616 / 0.000200 (0.004416) 0.000078 / 0.000054 (0.000023)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.022058 / 0.037411 (-0.015353) 0.089176 / 0.014526 (0.074650) 0.103331 / 0.176557 (-0.073225) 0.144993 / 0.737135 (-0.592142) 0.106418 / 0.296338 (-0.189921)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.350389 / 0.215209 (0.135180) 3.458638 / 2.077655 (1.380983) 1.565697 / 1.504120 (0.061577) 1.391658 / 1.541195 (-0.149537) 1.419007 / 1.468490 (-0.049483) 0.365952 / 4.584777 (-4.218825) 3.335780 / 3.745712 (-0.409932) 1.725798 / 5.269862 (-3.544063) 1.057318 / 4.565676 (-3.508358) 0.044281 / 0.424275 (-0.379994) 0.009662 / 0.007607 (0.002055) 0.433918 / 0.226044 (0.207873) 4.374468 / 2.268929 (2.105539) 1.909366 / 55.444624 (-53.535259) 1.621089 / 6.876477 (-5.255387) 1.721073 / 2.142072 (-0.420999) 0.466108 / 4.805227 (-4.339119) 0.101901 / 6.500664 (-6.398763) 0.051918 / 0.075469 (-0.023551)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.287895 / 1.841788 (-0.553893) 12.101495 / 8.074308 (4.027187) 22.228171 / 10.191392 (12.036779) 0.731314 / 0.680424 (0.050891) 0.486924 / 0.534201 (-0.047277) 0.342435 / 0.579283 (-0.236848) 0.384790 / 0.434364 (-0.049574) 0.239783 / 0.540337 (-0.300555) 0.235933 / 1.386936 (-1.151003)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.005566 / 0.011353 (-0.005787) 0.003641 / 0.011008 (-0.007367) 0.023867 / 0.038508 (-0.014641) 0.030137 / 0.023109 (0.007028) 0.344246 / 0.275898 (0.068348) 0.351681 / 0.323480 (0.028201) 0.003618 / 0.007986 (-0.004368) 0.003190 / 0.004328 (-0.001139) 0.004316 / 0.004250 (0.000066) 0.036504 / 0.037052 (-0.000548) 0.341781 / 0.258489 (0.083292) 0.362399 / 0.293841 (0.068558) 0.023059 / 0.128546 (-0.105487) 0.006344 / 0.075646 (-0.069302) 0.221001 / 0.419271 (-0.198270) 0.045309 / 0.043533 (0.001776) 0.331614 / 0.255139 (0.076475) 0.349165 / 0.283200 (0.065965) 0.089719 / 0.141683 (-0.051964) 1.248211 / 1.452155 (-0.203944) 1.355514 / 1.492716 (-0.137202)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.220761 / 0.018006 (0.202755) 0.432733 / 0.000490 (0.432244) 0.001182 / 0.000200 (0.000982) 0.000070 / 0.000054 (0.000015)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.020875 / 0.037411 (-0.016536) 0.089263 / 0.014526 (0.074738) 0.100134 / 0.176557 (-0.076423) 0.137652 / 0.737135 (-0.599483) 0.104438 / 0.296338 (-0.191900)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.369805 / 0.215209 (0.154596) 3.678346 / 2.077655 (1.600691) 1.751129 / 1.504120 (0.247009) 1.591562 / 1.541195 (0.050367) 1.686909 / 1.468490 (0.218419) 0.376791 / 4.584777 (-4.207986) 3.430543 / 3.745712 (-0.315169) 3.317289 / 5.269862 (-1.952572) 1.649999 / 4.565676 (-2.915677) 0.051463 / 0.424275 (-0.372812) 0.010924 / 0.007607 (0.003317) 0.527113 / 0.226044 (0.301068) 5.282897 / 2.268929 (3.013968) 2.485171 / 55.444624 (-52.959453) 2.182741 / 6.876477 (-4.693736) 2.232479 / 2.142072 (0.090406) 0.536872 / 4.805227 (-4.268355) 0.120637 / 6.500664 (-6.380027) 0.061150 / 0.075469 (-0.014319)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.361497 / 1.841788 (-0.480291) 12.155851 / 8.074308 (4.081543) 11.930845 / 10.191392 (1.739453) 0.824759 / 0.680424 (0.144335) 0.537140 / 0.534201 (0.002939) 0.326274 / 0.579283 (-0.253009) 0.371747 / 0.434364 (-0.062617) 0.219323 / 0.540337 (-0.321014) 0.221644 / 1.386936 (-1.165292)

CML watermark

Please sign in to comment.