From 7b92701661f493f81b38a94ab86b3fc12b5857ab Mon Sep 17 00:00:00 2001 From: Sander Land Date: Thu, 25 Aug 2022 11:05:02 +0200 Subject: [PATCH 1/7] add Dataset.from_list --- docs/source/loading.mdx | 10 +++++++++ src/datasets/arrow_dataset.py | 41 +++++++++++++++++++++++++++++++++-- src/datasets/table.py | 18 +++++++++++++++ tests/test_dataset_list.py | 30 +++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 tests/test_dataset_list.py diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 54b209d0a9b..55a1b6d72f9 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -205,6 +205,16 @@ Load Python dictionaries with [`~Dataset.from_dict`]: >>> dataset = Dataset.from_dict(my_dict) ``` +### Python list of dictionaries + +Load a list of Python dictionaries with [`~Dataset.from_list`]: + +```py +>>> from datasets import Dataset +>>> my_list = [{"a": 1}, {"a": 2}] +>>> dataset = Dataset.from_list(my_list) +``` + ### Pandas DataFrame Load Pandas DataFrames with [`~Dataset.from_pandas`]: diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index bc2c5ab37ba..b65a7d142b8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -63,7 +63,13 @@ from .download.download_config import DownloadConfig from .download.streaming_download_manager import xgetsize from .features import Audio, ClassLabel, Features, Image, Sequence, Value -from .features.features import FeatureType, decode_nested_example, pandas_types_mapper, require_decoding +from .features.features import ( + FeatureType, + decode_nested_example, + generate_from_arrow_type, + pandas_types_mapper, + require_decoding, +) from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -842,7 +848,7 @@ def from_dict( split: Optional[NamedSplit] = None, ) -> "Dataset": """ - Convert :obj:`dict` to a :obj:`pyarrow.Table` to create a :class:`Dataset`. + Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`. Args: mapping (:obj:`Mapping`): Mapping of strings to Arrays or Python lists. @@ -872,6 +878,37 @@ def from_dict( info.features = Features({col: ts.get_inferred_type() for col, ts in mapping.items()}) return cls(pa_table, info=info, split=split) + @classmethod + def from_list( + cls, + mapping: List[dict], + features: Optional[Features] = None, + info: Optional[DatasetInfo] = None, + split: Optional[NamedSplit] = None, + ) -> "Dataset": + """ + Convert :obj:`list of dicts` to a :obj:`pyarrow.Table` to create a :class:`Dataset`. + + Args: + mapping (:obj:`List[dict]`): A list of mappings of strings to row values. + features (:class:`Features`, optional): Dataset features. + info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc. + split (:class:`NamedSplit`, optional): Name of the dataset split. + + Returns: + :class:`Dataset` + """ + if info is not None and features is not None and info.features != features: + raise ValueError( + f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}" + ) + features = features if features is not None else info.features if info is not None else None + if info is None: + info = DatasetInfo() + info.features = features + pa_table = InMemoryTable.from_pylist(mapping=mapping) + return cls(pa_table, info=info, split=split) + @staticmethod def from_csv( path_or_paths: Union[PathLike, List[PathLike]], diff --git a/src/datasets/table.py b/src/datasets/table.py index 64da5e5e28e..c644bd840de 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -749,6 +749,24 @@ def from_pydict(cls, *args, **kwargs): """ return cls(pa.Table.from_pydict(*args, **kwargs)) + @classmethod + def from_pylist(cls, *args, **kwargs): + """ + Construct a Table from list of rows / dictionaries. + + Args: + mapping (:obj:`List[dict]`): + A mapping of strings to row values. + schema (:obj:`Schema`, defaults to :obj:`None`): + If not passed, will be inferred from the Mapping values + metadata (:obj:`Union[dict, Mapping]`, default None): + Optional metadata for the schema (if inferred). + + Returns: + :class:`datasets.table.Table`: + """ + return cls(pa.Table.from_pylist(*args, **kwargs)) + @classmethod def from_batches(cls, *args, **kwargs): """ diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py new file mode 100644 index 00000000000..f416e1e812d --- /dev/null +++ b/tests/test_dataset_list.py @@ -0,0 +1,30 @@ +from unittest import TestCase + +from datasets.arrow_dataset import Dataset + + +class DatasetListTest(TestCase): + def _create_example_records(self): + return [ + {"col_1": 3, "col_2": "a"}, + {"col_1": 2, "col_2": "b"}, + {"col_1": 1, "col_2": "c"}, + {"col_1": 0, "col_2": "d"}, + ] + + def _create_example_dict(self): + data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]} + return Dataset.from_dict(data) + + def test_create(self): + example_records = self._create_example_records() + dset = Dataset.from_list(example_records) + self.assertListEqual(dset.column_names, ["col_1", "col_2"]) + for i, r in enumerate(dset): + self.assertDictEqual(r, example_records[i]) + + def test_list_dict_equivalent(self): + example_records = self._create_example_records() + dset = Dataset.from_list(example_records) + dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]}) + self.assertEqual(dset.info, dset_from_dict.info) From 5a02d5ba1c594d13e7599053b05a3d1d7dff260e Mon Sep 17 00:00:00 2001 From: Sander Land Date: Thu, 25 Aug 2022 11:20:51 +0200 Subject: [PATCH 2/7] add more tests, clean up --- src/datasets/arrow_dataset.py | 15 ++++++--------- tests/test_dataset_list.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index b65a7d142b8..2a7486b52c7 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -63,13 +63,7 @@ from .download.download_config import DownloadConfig from .download.streaming_download_manager import xgetsize from .features import Audio, ClassLabel, Features, Image, Sequence, Value -from .features.features import ( - FeatureType, - decode_nested_example, - generate_from_arrow_type, - pandas_types_mapper, - require_decoding, -) +from .features.features import FeatureType, decode_nested_example, pandas_types_mapper, require_decoding from .filesystems import extract_path_from_uri, is_remote_filesystem from .fingerprint import ( fingerprint_transform, @@ -848,7 +842,7 @@ def from_dict( split: Optional[NamedSplit] = None, ) -> "Dataset": """ - Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`. + Convert :obj:`dict` to a :obj:`pyarrow.Table` to create a :class:`Dataset`. Args: mapping (:obj:`Mapping`): Mapping of strings to Arrays or Python lists. @@ -887,7 +881,10 @@ def from_list( split: Optional[NamedSplit] = None, ) -> "Dataset": """ - Convert :obj:`list of dicts` to a :obj:`pyarrow.Table` to create a :class:`Dataset`. + Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`. + + Note that the keys of the first entry will be used to determine the dataset columns, + regardless of what is passed to features. Args: mapping (:obj:`List[dict]`): A list of mappings of strings to row values. diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py index f416e1e812d..29f898dae40 100644 --- a/tests/test_dataset_list.py +++ b/tests/test_dataset_list.py @@ -1,5 +1,6 @@ from unittest import TestCase +from datasets import Sequence, Value from datasets.arrow_dataset import Dataset @@ -28,3 +29,14 @@ def test_list_dict_equivalent(self): dset = Dataset.from_list(example_records) dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]}) self.assertEqual(dset.info, dset_from_dict.info) + + def test_uneven_records(self): # checks what happens with missing columns + uneven_records = [{"col_1": 1}, {"col_2": "x"}] + dset = Dataset.from_list(uneven_records) + self.assertDictEqual(dset[0], {"col_1": 1}) + self.assertDictEqual(dset[1], {"col_1": None}) # NB: first record is used for columns + + def test_variable_list_records(self): # checks if the type can be inferred from the second record + list_records = [{"col_1": []}, {"col_1": [1, 2]}] + dset = Dataset.from_list(list_records) + self.assertEqual(dset.info.features["col_1"], Sequence(Value("int64"))) From d1faaf5324c3c2bb25be855af47593aaa150cf6f Mon Sep 17 00:00:00 2001 From: Sander Land Date: Thu, 25 Aug 2022 16:28:53 +0200 Subject: [PATCH 3/7] pyarrow 6 backward compatibility --- src/datasets/table.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/datasets/table.py b/src/datasets/table.py index c644bd840de..b12e78697e3 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -750,7 +750,7 @@ def from_pydict(cls, *args, **kwargs): return cls(pa.Table.from_pydict(*args, **kwargs)) @classmethod - def from_pylist(cls, *args, **kwargs): + def from_pylist(cls, mapping, *args, **kwargs): """ Construct a Table from list of rows / dictionaries. @@ -765,7 +765,11 @@ def from_pylist(cls, *args, **kwargs): Returns: :class:`datasets.table.Table`: """ - return cls(pa.Table.from_pylist(*args, **kwargs)) + try: + return cls(pa.Table.from_pylist(mapping, *args, **kwargs)) + except AttributeError: # pyarrow <7 does not have from_pylist, so we convert and use from_pydict + mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} + return cls(pa.Table.from_pydict(mapping, *args, **kwargs)) @classmethod def from_batches(cls, *args, **kwargs): From 6fe2973e92875bd0904c09b38164adff482deb1e Mon Sep 17 00:00:00 2001 From: Sander Land <48946947+sanderland@users.noreply.github.com> Date: Tue, 30 Aug 2022 12:10:39 +0200 Subject: [PATCH 4/7] Update docs/source/loading.mdx Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- docs/source/loading.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 55a1b6d72f9..1d482e2a937 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -211,7 +211,7 @@ Load a list of Python dictionaries with [`~Dataset.from_list`]: ```py >>> from datasets import Dataset ->>> my_list = [{"a": 1}, {"a": 2}] +>>> my_list = [{"a": 1}, {"a": 2}, {"a": 3}] >>> dataset = Dataset.from_list(my_list) ``` From a59b37d073fab8511faa113f63ce850f76c45ecb Mon Sep 17 00:00:00 2001 From: Sander Land <48946947+sanderland@users.noreply.github.com> Date: Tue, 30 Aug 2022 12:13:55 +0200 Subject: [PATCH 5/7] Update src/datasets/table.py Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- src/datasets/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/table.py b/src/datasets/table.py index b12e78697e3..abebb0a84de 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -768,7 +768,7 @@ def from_pylist(cls, mapping, *args, **kwargs): try: return cls(pa.Table.from_pylist(mapping, *args, **kwargs)) except AttributeError: # pyarrow <7 does not have from_pylist, so we convert and use from_pydict - mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} + mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {} return cls(pa.Table.from_pydict(mapping, *args, **kwargs)) @classmethod From 718fe1953131e379e958b0123e47c064bfab59e3 Mon Sep 17 00:00:00 2001 From: Sander Land Date: Tue, 30 Aug 2022 12:29:28 +0200 Subject: [PATCH 6/7] use from_dict in from_list, and add tests --- src/datasets/arrow_dataset.py | 13 +++---------- tests/test_dataset_list.py | 5 +++++ tests/test_table.py | 8 ++++++++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 2a7486b52c7..288254cbc9c 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -895,16 +895,9 @@ def from_list( Returns: :class:`Dataset` """ - if info is not None and features is not None and info.features != features: - raise ValueError( - f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}" - ) - features = features if features is not None else info.features if info is not None else None - if info is None: - info = DatasetInfo() - info.features = features - pa_table = InMemoryTable.from_pylist(mapping=mapping) - return cls(pa_table, info=info, split=split) + # for simplicity and consistency wrt OptimizedTypedSequence we do not use InMemoryTable.from_pylist here + mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {} + return cls.from_dict(mapping, features, info, split) @staticmethod def from_csv( diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py index 29f898dae40..1004ae3cd68 100644 --- a/tests/test_dataset_list.py +++ b/tests/test_dataset_list.py @@ -40,3 +40,8 @@ def test_variable_list_records(self): # checks if the type can be inferred from list_records = [{"col_1": []}, {"col_1": [1, 2]}] dset = Dataset.from_list(list_records) self.assertEqual(dset.info.features["col_1"], Sequence(Value("int64"))) + + def test_create_empty(self): + dset = Dataset.from_list([]) + self.assertEqual(len(dset), 0) + self.assertListEqual(dset.column_names, []) diff --git a/tests/test_table.py b/tests/test_table.py index 073c8b0482e..5e0319d61a3 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -255,6 +255,14 @@ def test_in_memory_table_from_pydict(in_memory_pa_table): assert table.table == pa.Table.from_pydict(pydict) +def test_in_memory_table_from_pylist(in_memory_pa_table): + pylist = in_memory_pa_table.to_pylist() + with assert_arrow_memory_increases(): + table = InMemoryTable.from_pylist(pylist) + assert isinstance(table, InMemoryTable) + assert table.table == pa.Table.from_pylist(pylist) + + def test_in_memory_table_from_batches(in_memory_pa_table): batches = list(in_memory_pa_table.to_batches()) table = InMemoryTable.from_batches(batches) From 658abe29e91e01891ae06597cd7ff492cb0b01ba Mon Sep 17 00:00:00 2001 From: Sander Land Date: Thu, 1 Sep 2022 18:47:01 +0200 Subject: [PATCH 7/7] fix test for pyarrow 6, add to_pylist --- src/datasets/table.py | 13 +++++++++++++ tests/test_table.py | 9 ++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/datasets/table.py b/src/datasets/table.py index abebb0a84de..6ea551626b4 100644 --- a/src/datasets/table.py +++ b/src/datasets/table.py @@ -252,6 +252,19 @@ def to_pydict(self, *args, **kwargs): """ return self.table.to_pydict(*args, **kwargs) + def to_pylist(self, *args, **kwargs): + """ + Convert the Table to a list + + Returns: + :obj:`list` + """ + try: + return self.table.to_pylist(*args, **kwargs) + except AttributeError: # pyarrow <7 does not have to_pylist, so we use to_pydict + pydict = self.table.to_pydict(*args, **kwargs) + return [{k: pydict[k][i] for k in pydict} for i in range(len(self.table))] + def to_pandas(self, *args, **kwargs): """ Convert to a pandas-compatible NumPy array or DataFrame, as appropriate diff --git a/tests/test_table.py b/tests/test_table.py index 5e0319d61a3..4bb31900dea 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -256,11 +256,10 @@ def test_in_memory_table_from_pydict(in_memory_pa_table): def test_in_memory_table_from_pylist(in_memory_pa_table): - pylist = in_memory_pa_table.to_pylist() - with assert_arrow_memory_increases(): - table = InMemoryTable.from_pylist(pylist) - assert isinstance(table, InMemoryTable) - assert table.table == pa.Table.from_pylist(pylist) + pylist = InMemoryTable(in_memory_pa_table).to_pylist() + table = InMemoryTable.from_pylist(pylist) + assert isinstance(table, InMemoryTable) + assert pylist == table.to_pylist() def test_in_memory_table_from_batches(in_memory_pa_table):