Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore saved format state in load_from_disk #5073

Merged
3 commits merged on Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/datasets/arrow_dataset.py
Expand Up @@ -1346,13 +1346,23 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
split = state["_split"]
split = Split(split) if split is not None else split

return Dataset(
dataset = Dataset(
arrow_table=arrow_table,
info=dataset_info,
split=split,
fingerprint=state["_fingerprint"],
)

format = {
"type": state["_format_type"],
"format_kwargs": state["_format_kwargs"],
"columns": state["_format_columns"],
"output_all_columns": state["_output_all_columns"],
}
dataset = dataset.with_format(**format)

return dataset

@property
def data(self) -> Table:
"""The Apache Arrow table backing the dataset.
Expand Down
11 changes: 11 additions & 0 deletions tests/test_arrow_dataset.py
Expand Up @@ -318,6 +318,17 @@ def test_dummy_dataset_load_from_disk(self, in_memory):
self.assertEqual(dset[0]["filename"], "my_name-train_0")
self.assertEqual(dset["filename"][0], "my_name-train_0")

def test_restore_saved_format(self, in_memory):
    """Check that the format of a saved dataset equals the format after reloading.

    A dummy multi-column dataset is given a "numpy" format restricted to
    ``col_1`` with ``output_all_columns=True``, written to a temporary
    directory with ``save_to_disk``, and read back with ``load_from_disk``.
    The ``format`` of the reloaded dataset must equal that of the original.
    """
    with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
        in_memory, tmp_dir, multiple_columns=True
    ) as source_dset:
        source_dset.set_format(type="numpy", columns=["col_1"], output_all_columns=True)

        saved_path = os.path.join(tmp_dir, "my_dataset")
        source_dset.save_to_disk(saved_path)

        # Compare while the original dataset is still open.
        with load_from_disk(saved_path) as reloaded_dset:
            expected_format = source_dset.format
            restored_format = reloaded_dset.format
            self.assertEqual(expected_format, restored_format)

def test_set_format_numpy_multiple_columns(self, in_memory):
with tempfile.TemporaryDirectory() as tmp_dir:
with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:
Expand Down