From 7b92701661f493f81b38a94ab86b3fc12b5857ab Mon Sep 17 00:00:00 2001
From: Sander Land <sander@chatdesk.com>
Date: Thu, 25 Aug 2022 11:05:02 +0200
Subject: [PATCH 1/7] add Dataset.from_list

---
 docs/source/loading.mdx       | 10 +++++++++
 src/datasets/arrow_dataset.py | 41 +++++++++++++++++++++++++++++++++--
 src/datasets/table.py         | 18 +++++++++++++++
 tests/test_dataset_list.py    | 30 +++++++++++++++++++++++++
 4 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_dataset_list.py

diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx
index 54b209d0a9b..55a1b6d72f9 100644
--- a/docs/source/loading.mdx
+++ b/docs/source/loading.mdx
@@ -205,6 +205,16 @@ Load Python dictionaries with [`~Dataset.from_dict`]:
 >>> dataset = Dataset.from_dict(my_dict)
 ```
 
+### Python list of dictionaries
+
+Load a list of Python dictionaries with [`~Dataset.from_list`]:
+
+```py
+>>> from datasets import Dataset
+>>> my_list = [{"a": 1}, {"a": 2}]
+>>> dataset = Dataset.from_list(my_list)
+```
+
 ### Pandas DataFrame
 
 Load Pandas DataFrames with [`~Dataset.from_pandas`]:
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index bc2c5ab37ba..b65a7d142b8 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -63,7 +63,13 @@
 from .download.download_config import DownloadConfig
 from .download.streaming_download_manager import xgetsize
 from .features import Audio, ClassLabel, Features, Image, Sequence, Value
-from .features.features import FeatureType, decode_nested_example, pandas_types_mapper, require_decoding
+from .features.features import (
+    FeatureType,
+    decode_nested_example,
+    generate_from_arrow_type,
+    pandas_types_mapper,
+    require_decoding,
+)
 from .filesystems import extract_path_from_uri, is_remote_filesystem
 from .fingerprint import (
     fingerprint_transform,
@@ -842,7 +848,7 @@ def from_dict(
         split: Optional[NamedSplit] = None,
     ) -> "Dataset":
         """
-        Convert :obj:`dict` to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
+        Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
 
         Args:
             mapping (:obj:`Mapping`): Mapping of strings to Arrays or Python lists.
@@ -872,6 +878,37 @@ def from_dict(
             info.features = Features({col: ts.get_inferred_type() for col, ts in mapping.items()})
         return cls(pa_table, info=info, split=split)
 
+    @classmethod
+    def from_list(
+        cls,
+        mapping: List[dict],
+        features: Optional[Features] = None,
+        info: Optional[DatasetInfo] = None,
+        split: Optional[NamedSplit] = None,
+    ) -> "Dataset":
+        """
+        Convert :obj:`list of dicts` to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
+
+        Args:
+            mapping (:obj:`List[dict]`): A list of mappings of strings to row values.
+            features (:class:`Features`, optional): Dataset features.
+            info (:class:`DatasetInfo`, optional): Dataset information, like description, citation, etc.
+            split (:class:`NamedSplit`, optional): Name of the dataset split.
+
+        Returns:
+            :class:`Dataset`
+        """
+        if info is not None and features is not None and info.features != features:
+            raise ValueError(
+                f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
+            )
+        features = features if features is not None else info.features if info is not None else None
+        if info is None:
+            info = DatasetInfo()
+        info.features = features
+        pa_table = InMemoryTable.from_pylist(mapping=mapping)
+        return cls(pa_table, info=info, split=split)
+
     @staticmethod
     def from_csv(
         path_or_paths: Union[PathLike, List[PathLike]],
diff --git a/src/datasets/table.py b/src/datasets/table.py
index 64da5e5e28e..c644bd840de 100644
--- a/src/datasets/table.py
+++ b/src/datasets/table.py
@@ -749,6 +749,24 @@ def from_pydict(cls, *args, **kwargs):
         """
         return cls(pa.Table.from_pydict(*args, **kwargs))
 
+    @classmethod
+    def from_pylist(cls, *args, **kwargs):
+        """
+        Construct a Table from list of rows / dictionaries.
+
+        Args:
+            mapping (:obj:`List[dict]`):
+                A list of mappings of strings to row values.
+            schema (:obj:`Schema`, defaults to :obj:`None`):
+                If not passed, will be inferred from the Mapping values
+            metadata (:obj:`Union[dict, Mapping]`, default None):
+                Optional metadata for the schema (if inferred).
+
+        Returns:
+            :class:`datasets.table.Table`:
+        """
+        return cls(pa.Table.from_pylist(*args, **kwargs))
+
     @classmethod
     def from_batches(cls, *args, **kwargs):
         """
diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py
new file mode 100644
index 00000000000..f416e1e812d
--- /dev/null
+++ b/tests/test_dataset_list.py
@@ -0,0 +1,30 @@
+from unittest import TestCase
+
+from datasets.arrow_dataset import Dataset
+
+
+class DatasetListTest(TestCase):
+    def _create_example_records(self):
+        return [
+            {"col_1": 3, "col_2": "a"},
+            {"col_1": 2, "col_2": "b"},
+            {"col_1": 1, "col_2": "c"},
+            {"col_1": 0, "col_2": "d"},
+        ]
+
+    def _create_example_dict(self):
+        data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+        return Dataset.from_dict(data)
+
+    def test_create(self):
+        example_records = self._create_example_records()
+        dset = Dataset.from_list(example_records)
+        self.assertListEqual(dset.column_names, ["col_1", "col_2"])
+        for i, r in enumerate(dset):
+            self.assertDictEqual(r, example_records[i])
+
+    def test_list_dict_equivalent(self):
+        example_records = self._create_example_records()
+        dset = Dataset.from_list(example_records)
+        dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]})
+        self.assertEqual(dset.info, dset_from_dict.info)

From 5a02d5ba1c594d13e7599053b05a3d1d7dff260e Mon Sep 17 00:00:00 2001
From: Sander Land <sander@chatdesk.com>
Date: Thu, 25 Aug 2022 11:20:51 +0200
Subject: [PATCH 2/7] add more tests, clean up

---
 src/datasets/arrow_dataset.py | 15 ++++++---------
 tests/test_dataset_list.py    | 12 ++++++++++++
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index b65a7d142b8..2a7486b52c7 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -63,13 +63,7 @@
 from .download.download_config import DownloadConfig
 from .download.streaming_download_manager import xgetsize
 from .features import Audio, ClassLabel, Features, Image, Sequence, Value
-from .features.features import (
-    FeatureType,
-    decode_nested_example,
-    generate_from_arrow_type,
-    pandas_types_mapper,
-    require_decoding,
-)
+from .features.features import FeatureType, decode_nested_example, pandas_types_mapper, require_decoding
 from .filesystems import extract_path_from_uri, is_remote_filesystem
 from .fingerprint import (
     fingerprint_transform,
@@ -848,7 +842,7 @@ def from_dict(
         split: Optional[NamedSplit] = None,
     ) -> "Dataset":
         """
-        Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
+        Convert :obj:`dict` to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
 
         Args:
             mapping (:obj:`Mapping`): Mapping of strings to Arrays or Python lists.
@@ -887,7 +881,10 @@ def from_list(
         split: Optional[NamedSplit] = None,
     ) -> "Dataset":
         """
-        Convert :obj:`list of dicts` to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
+        Convert a list of dicts to a :obj:`pyarrow.Table` to create a :class:`Dataset`.
+
+        Note that the keys of the first entry will be used to determine the dataset columns,
+        regardless of what is passed to features.
 
         Args:
             mapping (:obj:`List[dict]`): A list of mappings of strings to row values.
diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py
index f416e1e812d..29f898dae40 100644
--- a/tests/test_dataset_list.py
+++ b/tests/test_dataset_list.py
@@ -1,5 +1,6 @@
 from unittest import TestCase
 
+from datasets import Sequence, Value
 from datasets.arrow_dataset import Dataset
 
 
@@ -28,3 +29,14 @@ def test_list_dict_equivalent(self):
         dset = Dataset.from_list(example_records)
         dset_from_dict = Dataset.from_dict({k: [r[k] for r in example_records] for k in example_records[0]})
         self.assertEqual(dset.info, dset_from_dict.info)
+
+    def test_uneven_records(self):  # checks what happens with missing columns
+        uneven_records = [{"col_1": 1}, {"col_2": "x"}]
+        dset = Dataset.from_list(uneven_records)
+        self.assertDictEqual(dset[0], {"col_1": 1})
+        self.assertDictEqual(dset[1], {"col_1": None})  # NB: first record is used for columns
+
+    def test_variable_list_records(self):  # checks if the type can be inferred from the second record
+        list_records = [{"col_1": []}, {"col_1": [1, 2]}]
+        dset = Dataset.from_list(list_records)
+        self.assertEqual(dset.info.features["col_1"], Sequence(Value("int64")))

From d1faaf5324c3c2bb25be855af47593aaa150cf6f Mon Sep 17 00:00:00 2001
From: Sander Land <sander@chatdesk.com>
Date: Thu, 25 Aug 2022 16:28:53 +0200
Subject: [PATCH 3/7] pyarrow 6 backward compatibility

---
 src/datasets/table.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/datasets/table.py b/src/datasets/table.py
index c644bd840de..b12e78697e3 100644
--- a/src/datasets/table.py
+++ b/src/datasets/table.py
@@ -750,7 +750,7 @@ def from_pydict(cls, *args, **kwargs):
         return cls(pa.Table.from_pydict(*args, **kwargs))
 
     @classmethod
-    def from_pylist(cls, *args, **kwargs):
+    def from_pylist(cls, mapping, *args, **kwargs):
         """
         Construct a Table from list of rows / dictionaries.
 
@@ -765,7 +765,11 @@ def from_pylist(cls, *args, **kwargs):
         Returns:
             :class:`datasets.table.Table`:
         """
-        return cls(pa.Table.from_pylist(*args, **kwargs))
+        try:
+            return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
+        except AttributeError:  # pyarrow <7 does not have from_pylist, so we convert and use from_pydict
+            mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]}
+            return cls(pa.Table.from_pydict(mapping, *args, **kwargs))
 
     @classmethod
     def from_batches(cls, *args, **kwargs):

From 6fe2973e92875bd0904c09b38164adff482deb1e Mon Sep 17 00:00:00 2001
From: Sander Land <48946947+sanderland@users.noreply.github.com>
Date: Tue, 30 Aug 2022 12:10:39 +0200
Subject: [PATCH 4/7] Update docs/source/loading.mdx

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 docs/source/loading.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx
index 55a1b6d72f9..1d482e2a937 100644
--- a/docs/source/loading.mdx
+++ b/docs/source/loading.mdx
@@ -211,7 +211,7 @@ Load a list of Python dictionaries with [`~Dataset.from_list`]:
 
 ```py
 >>> from datasets import Dataset
->>> my_list = [{"a": 1}, {"a": 2}]
+>>> my_list = [{"a": 1}, {"a": 2}, {"a": 3}]
 >>> dataset = Dataset.from_list(my_list)
 ```
 

From a59b37d073fab8511faa113f63ce850f76c45ecb Mon Sep 17 00:00:00 2001
From: Sander Land <48946947+sanderland@users.noreply.github.com>
Date: Tue, 30 Aug 2022 12:13:55 +0200
Subject: [PATCH 5/7] Update src/datasets/table.py

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 src/datasets/table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/table.py b/src/datasets/table.py
index b12e78697e3..abebb0a84de 100644
--- a/src/datasets/table.py
+++ b/src/datasets/table.py
@@ -768,7 +768,7 @@ def from_pylist(cls, mapping, *args, **kwargs):
         try:
             return cls(pa.Table.from_pylist(mapping, *args, **kwargs))
         except AttributeError:  # pyarrow <7 does not have from_pylist, so we convert and use from_pydict
-            mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]}
+            mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {}
             return cls(pa.Table.from_pydict(mapping, *args, **kwargs))
 
     @classmethod

From 718fe1953131e379e958b0123e47c064bfab59e3 Mon Sep 17 00:00:00 2001
From: Sander Land <sander@chatdesk.com>
Date: Tue, 30 Aug 2022 12:29:28 +0200
Subject: [PATCH 6/7] use from_dict in from_list, and add tests

---
 src/datasets/arrow_dataset.py | 13 +++----------
 tests/test_dataset_list.py    |  5 +++++
 tests/test_table.py           |  8 ++++++++
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 2a7486b52c7..288254cbc9c 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -895,16 +895,9 @@ def from_list(
         Returns:
             :class:`Dataset`
         """
-        if info is not None and features is not None and info.features != features:
-            raise ValueError(
-                f"Features specified in `features` and `info.features` can't be different:\n{features}\n{info.features}"
-            )
-        features = features if features is not None else info.features if info is not None else None
-        if info is None:
-            info = DatasetInfo()
-        info.features = features
-        pa_table = InMemoryTable.from_pylist(mapping=mapping)
-        return cls(pa_table, info=info, split=split)
+        # for simplicity and consistency wrt OptimizedTypedSequence we do not use InMemoryTable.from_pylist here
+        mapping = {k: [r.get(k) for r in mapping] for k in mapping[0]} if mapping else {}
+        return cls.from_dict(mapping, features, info, split)
 
     @staticmethod
     def from_csv(
diff --git a/tests/test_dataset_list.py b/tests/test_dataset_list.py
index 29f898dae40..1004ae3cd68 100644
--- a/tests/test_dataset_list.py
+++ b/tests/test_dataset_list.py
@@ -40,3 +40,8 @@ def test_variable_list_records(self):  # checks if the type can be inferred from
         list_records = [{"col_1": []}, {"col_1": [1, 2]}]
         dset = Dataset.from_list(list_records)
         self.assertEqual(dset.info.features["col_1"], Sequence(Value("int64")))
+
+    def test_create_empty(self):
+        dset = Dataset.from_list([])
+        self.assertEqual(len(dset), 0)
+        self.assertListEqual(dset.column_names, [])
diff --git a/tests/test_table.py b/tests/test_table.py
index 073c8b0482e..5e0319d61a3 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -255,6 +255,14 @@ def test_in_memory_table_from_pydict(in_memory_pa_table):
         assert table.table == pa.Table.from_pydict(pydict)
 
 
+def test_in_memory_table_from_pylist(in_memory_pa_table):
+    pylist = in_memory_pa_table.to_pylist()
+    with assert_arrow_memory_increases():
+        table = InMemoryTable.from_pylist(pylist)
+        assert isinstance(table, InMemoryTable)
+        assert table.table == pa.Table.from_pylist(pylist)
+
+
 def test_in_memory_table_from_batches(in_memory_pa_table):
     batches = list(in_memory_pa_table.to_batches())
     table = InMemoryTable.from_batches(batches)

From 658abe29e91e01891ae06597cd7ff492cb0b01ba Mon Sep 17 00:00:00 2001
From: Sander Land <sander@chatdesk.com>
Date: Thu, 1 Sep 2022 18:47:01 +0200
Subject: [PATCH 7/7] fix test for pyarrow 6, add to_pylist

---
 src/datasets/table.py | 13 +++++++++++++
 tests/test_table.py   |  9 ++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/datasets/table.py b/src/datasets/table.py
index abebb0a84de..6ea551626b4 100644
--- a/src/datasets/table.py
+++ b/src/datasets/table.py
@@ -252,6 +252,19 @@ def to_pydict(self, *args, **kwargs):
         """
         return self.table.to_pydict(*args, **kwargs)
 
+    def to_pylist(self, *args, **kwargs):
+        """
+        Convert the Table to a list of rows / dictionaries.
+
+        Returns:
+            :obj:`list`
+        """
+        try:
+            return self.table.to_pylist(*args, **kwargs)
+        except AttributeError:  # pyarrow <7 does not have to_pylist, so we use to_pydict
+            pydict = self.table.to_pydict(*args, **kwargs)
+            return [{k: pydict[k][i] for k in pydict} for i in range(len(self.table))]
+
     def to_pandas(self, *args, **kwargs):
         """
         Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
diff --git a/tests/test_table.py b/tests/test_table.py
index 5e0319d61a3..4bb31900dea 100644
--- a/tests/test_table.py
+++ b/tests/test_table.py
@@ -256,11 +256,10 @@ def test_in_memory_table_from_pydict(in_memory_pa_table):
 
 
 def test_in_memory_table_from_pylist(in_memory_pa_table):
-    pylist = in_memory_pa_table.to_pylist()
-    with assert_arrow_memory_increases():
-        table = InMemoryTable.from_pylist(pylist)
-        assert isinstance(table, InMemoryTable)
-        assert table.table == pa.Table.from_pylist(pylist)
+    pylist = InMemoryTable(in_memory_pa_table).to_pylist()
+    table = InMemoryTable.from_pylist(pylist)
+    assert isinstance(table, InMemoryTable)
+    assert pylist == table.to_pylist()
 
 
 def test_in_memory_table_from_batches(in_memory_pa_table):