Add support for parsing JSON files in array form (#4997)
* Support parsing JSON lists

* Add error handling

* Minor improvements

* Add tests

* Comment
mariosasko committed Sep 20, 2022
1 parent ace149f commit 1a9385d
Showing 3 changed files with 109 additions and 8 deletions.
44 changes: 37 additions & 7 deletions src/datasets/packaged_modules/json/json.py
@@ -15,6 +15,25 @@
logger = datasets.utils.logging.get_logger(__name__)


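# `pa.Table.from_pylist` is only available in PyArrow >= 7.0, hence the version check below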
if datasets.config.PYARROW_VERSION.major >= 7:

    def pa_table_from_pylist(mapping):
        return pa.Table.from_pylist(mapping)

else:

    def pa_table_from_pylist(mapping):
        # Copied from: https://github.com/apache/arrow/blob/master/python/pyarrow/table.pxi#L5193
        arrays = []
        names = []
        if mapping:
            names = list(mapping[0].keys())
        for n in names:
            v = [row[n] if n in row else None for row in mapping]
            arrays.append(v)
        return pa.Table.from_arrays(arrays, names)


@dataclass
class JsonConfig(datasets.BuilderConfig):
"""BuilderConfig for JSON."""
@@ -125,18 +144,29 @@ def _generate_tables(self, files):
                                        )
                                        block_size *= 2
                        except pa.ArrowInvalid as e:
-                           logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                            try:
                                with open(file, encoding="utf-8") as f:
                                    dataset = json.load(f)
                            except json.JSONDecodeError:
+                               logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                                raise e
-                           raise ValueError(
-                               f"Not able to read records in the JSON file at {file}. "
-                               f"You should probably indicate the field of the JSON file containing your records. "
-                               f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
-                               f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
-                           ) from None
+                           # If possible, parse the file as a list of json objects and exit the loop
+                           if isinstance(dataset, list):  # list is the only sequence type supported in JSON
+                               try:
+                                   pa_table = pa_table_from_pylist(dataset)
+                               except (pa.ArrowInvalid, AttributeError) as e:
+                                   logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
+                                   raise ValueError(f"Not able to read records in the JSON file at {file}.") from None
+                               yield file_idx, self._cast_table(pa_table)
+                               break
+                           else:
+                               logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
+                               raise ValueError(
+                                   f"Not able to read records in the JSON file at {file}. "
+                                   f"You should probably indicate the field of the JSON file containing your records. "
+                                   f"This JSON file contains the following fields: {str(list(dataset.keys()))}. "
+                                   f"Select the correct one and provide it as `field='XXX'` to the dataset loading method. "
+                               ) from None
                        # Uncomment for debugging (will print the Arrow table size and elements)
                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
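For context, here is a minimal sketch (not part of the commit) of what this new fallback path does for a JSON file whose top level is an array of objects. It assumes PyArrow >= 7.0 so that `pa.Table.from_pylist` is available directly; the shim above covers older versions:

```python
import json

import pyarrow as pa

# A top-level JSON array: pyarrow.json.read_json expects newline-delimited
# objects, so the loader falls back to json.load + from_pylist for this layout.
raw = '[{"col_1": 1, "col_2": 2}, {"col_1": 10, "col_2": 20}]'
dataset = json.loads(raw)

if isinstance(dataset, list):  # list is the only sequence type in JSON
    pa_table = pa.Table.from_pylist(dataset)
    print(pa_table.to_pydict())  # {'col_1': [1, 10], 'col_2': [2, 20]}
```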
2 changes: 1 addition & 1 deletion tests/packaged_modules/test_csv.py
@@ -12,7 +12,7 @@

@pytest.fixture
def csv_file(tmp_path):
-   filename = tmp_path / "malformed_file.csv"
+   filename = tmp_path / "file.csv"
    data = textwrap.dedent(
        """\
        header1,header2
71 changes: 71 additions & 0 deletions tests/packaged_modules/test_json.py
@@ -0,0 +1,71 @@
import textwrap

import pyarrow as pa
import pytest

from datasets.packaged_modules.json.json import Json


@pytest.fixture
def jsonl_file(tmp_path):
    filename = tmp_path / "file.jsonl"
    data = textwrap.dedent(
        """\
        {"col_1": 1, "col_2": 2}
        {"col_1": 10, "col_2": 20}
        """
    )
    with open(filename, "w") as f:
        f.write(data)
    return str(filename)


@pytest.fixture
def json_file_with_list_of_dicts(tmp_path):
    filename = tmp_path / "file_with_list_of_dicts.json"
    data = textwrap.dedent(
        """\
        [
            {"col_1": 1, "col_2": 2},
            {"col_1": 10, "col_2": 20}
        ]
        """
    )
    with open(filename, "w") as f:
        f.write(data)
    return str(filename)


@pytest.fixture
def json_file_with_list_of_dicts_field(tmp_path):
    filename = tmp_path / "file_with_list_of_dicts_field.json"
    data = textwrap.dedent(
        """\
        {
            "field1": 1,
            "field2": "aabb",
            "field3": [
                {"col_1": 1, "col_2": 2},
                {"col_1": 10, "col_2": 20}
            ]
        }
        """
    )
    with open(filename, "w") as f:
        f.write(data)
    return str(filename)


@pytest.mark.parametrize(
    "file_fixture, config_kwargs",
    [
        ("jsonl_file", {}),
        ("json_file_with_list_of_dicts", {}),
        ("json_file_with_list_of_dicts_field", {"field": "field3"}),
    ],
)
def test_json_generate_tables(file_fixture, config_kwargs, request):
    json = Json(**config_kwargs)
    generator = json._generate_tables([[request.getfixturevalue(file_fixture)]])
    pa_table = pa.concat_tables([table for _, table in generator])
    assert pa_table.to_pydict() == {"col_1": [1, 10], "col_2": [2, 20]}

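End to end, the change means a JSON file laid out as a top-level array can now be loaded without specifying `field`. A brief usage sketch (not part of the commit; the file path `my_data.json` is hypothetical):

```python
from datasets import load_dataset

# my_data.json contains: [{"col_1": 1, "col_2": 2}, {"col_1": 10, "col_2": 20}]
ds = load_dataset("json", data_files="my_data.json", split="train")

# If the records instead sit under a key of a top-level object, point the
# loader at that key, as the improved error message suggests:
# ds = load_dataset("json", data_files="my_data.json", field="field3")
```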
1 comment on commit 1a9385d

@github-actions
Show benchmarks

PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.007959 / 0.011353 (-0.003394) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003787 / 0.011008 (-0.007221) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.030957 / 0.038508 (-0.007551) |
| read_batch_unformated after write_array2d | 0.034467 / 0.023109 (0.011358) |
| read_batch_unformated after write_flattened_sequence | 0.294811 / 0.275898 (0.018913) |
| read_batch_unformated after write_nested_sequence | 0.356999 / 0.323480 (0.033519) |
| read_col_formatted_as_numpy after write_array2d | 0.005969 / 0.007986 (-0.002017) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003372 / 0.004328 (-0.000957) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.006970 / 0.004250 (0.002719) |
| read_col_unformated after write_array2d | 0.049548 / 0.037052 (0.012496) |
| read_col_unformated after write_flattened_sequence | 0.310727 / 0.258489 (0.052238) |
| read_col_unformated after write_nested_sequence | 0.352133 / 0.293841 (0.058292) |
| read_formatted_as_numpy after write_array2d | 0.031742 / 0.128546 (-0.096804) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009673 / 0.075646 (-0.065973) |
| read_formatted_as_numpy after write_nested_sequence | 0.262301 / 0.419271 (-0.156971) |
| read_unformated after write_array2d | 0.052458 / 0.043533 (0.008925) |
| read_unformated after write_flattened_sequence | 0.296423 / 0.255139 (0.041285) |
| read_unformated after write_nested_sequence | 0.320172 / 0.283200 (0.036972) |
| write_array2d | 0.101515 / 0.141683 (-0.040167) |
| write_flattened_sequence | 1.498441 / 1.452155 (0.046287) |
| write_nested_sequence | 1.524952 / 1.492716 (0.032236) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.216339 / 0.018006 (0.198333) |
| get_batch_of_1024_rows | 0.437944 / 0.000490 (0.437455) |
| get_first_row | 0.001012 / 0.000200 (0.000812) |
| get_last_row | 0.000083 / 0.000054 (0.000028) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.023196 / 0.037411 (-0.014215) |
| shard | 0.101299 / 0.014526 (0.086773) |
| shuffle | 0.116372 / 0.176557 (-0.060184) |
| sort | 0.161245 / 0.737135 (-0.575891) |
| train_test_split | 0.117114 / 0.296338 (-0.179224) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.393979 / 0.215209 (0.178770) |
| read 50000 | 3.938357 / 2.077655 (1.860702) |
| read_batch 50000 10 | 1.776972 / 1.504120 (0.272852) |
| read_batch 50000 100 | 1.555477 / 1.541195 (0.014282) |
| read_batch 50000 1000 | 1.578771 / 1.468490 (0.110281) |
| read_formatted numpy 5000 | 0.423137 / 4.584777 (-4.161640) |
| read_formatted pandas 5000 | 3.743462 / 3.745712 (-0.002250) |
| read_formatted tensorflow 5000 | 2.039494 / 5.269862 (-3.230368) |
| read_formatted torch 5000 | 1.436178 / 4.565676 (-3.129498) |
| read_formatted_batch numpy 5000 10 | 0.051090 / 0.424275 (-0.373186) |
| read_formatted_batch numpy 5000 1000 | 0.010990 / 0.007607 (0.003383) |
| shuffled read 5000 | 0.502640 / 0.226044 (0.276595) |
| shuffled read 50000 | 5.086269 / 2.268929 (2.817341) |
| shuffled read_batch 50000 10 | 2.184557 / 55.444624 (-53.260067) |
| shuffled read_batch 50000 100 | 1.853991 / 6.876477 (-5.022486) |
| shuffled read_batch 50000 1000 | 1.964622 / 2.142072 (-0.177450) |
| shuffled read_formatted numpy 5000 | 0.575854 / 4.805227 (-4.229373) |
| shuffled read_formatted_batch numpy 5000 10 | 0.131675 / 6.500664 (-6.368989) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.060967 / 0.075469 (-0.014502) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.472312 / 1.841788 (-0.369476) |
| map fast-tokenizer batched | 13.569149 / 8.074308 (5.494841) |
| map identity | 24.941361 / 10.191392 (14.749969) |
| map identity batched | 0.894162 / 0.680424 (0.213739) |
| map no-op batched | 0.565933 / 0.534201 (0.031732) |
| map no-op batched numpy | 0.384051 / 0.579283 (-0.195233) |
| map no-op batched pandas | 0.451500 / 0.434364 (0.017137) |
| map no-op batched pytorch | 0.287228 / 0.540337 (-0.253110) |
| map no-op batched tensorflow | 0.298179 / 1.386936 (-1.088757) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
| --- | --- |
| read_batch_formatted_as_numpy after write_array2d | 0.006170 / 0.011353 (-0.005183) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003983 / 0.011008 (-0.007025) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.027869 / 0.038508 (-0.010640) |
| read_batch_unformated after write_array2d | 0.033788 / 0.023109 (0.010679) |
| read_batch_unformated after write_flattened_sequence | 0.384962 / 0.275898 (0.109064) |
| read_batch_unformated after write_nested_sequence | 0.456076 / 0.323480 (0.132596) |
| read_col_formatted_as_numpy after write_array2d | 0.003944 / 0.007986 (-0.004042) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004734 / 0.004328 (0.000406) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.004881 / 0.004250 (0.000630) |
| read_col_unformated after write_array2d | 0.045594 / 0.037052 (0.008542) |
| read_col_unformated after write_flattened_sequence | 0.396258 / 0.258489 (0.137769) |
| read_col_unformated after write_nested_sequence | 0.426390 / 0.293841 (0.132549) |
| read_formatted_as_numpy after write_array2d | 0.030094 / 0.128546 (-0.098452) |
| read_formatted_as_numpy after write_flattened_sequence | 0.009485 / 0.075646 (-0.066161) |
| read_formatted_as_numpy after write_nested_sequence | 0.257109 / 0.419271 (-0.162162) |
| read_unformated after write_array2d | 0.053898 / 0.043533 (0.010365) |
| read_unformated after write_flattened_sequence | 0.391243 / 0.255139 (0.136104) |
| read_unformated after write_nested_sequence | 0.401964 / 0.283200 (0.118764) |
| write_array2d | 0.103336 / 0.141683 (-0.038347) |
| write_flattened_sequence | 1.476316 / 1.452155 (0.024162) |
| write_nested_sequence | 1.513441 / 1.492716 (0.020724) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
| --- | --- |
| get_batch_of_1024_random_rows | 0.215817 / 0.018006 (0.197811) |
| get_batch_of_1024_rows | 0.443671 / 0.000490 (0.443181) |
| get_first_row | 0.038253 / 0.000200 (0.038053) |
| get_last_row | 0.000123 / 0.000054 (0.000069) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
| --- | --- |
| select | 0.023598 / 0.037411 (-0.013813) |
| shard | 0.100831 / 0.014526 (0.086305) |
| shuffle | 0.116845 / 0.176557 (-0.059711) |
| sort | 0.160772 / 0.737135 (-0.576363) |
| train_test_split | 0.118510 / 0.296338 (-0.177828) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
| --- | --- |
| read 5000 | 0.439040 / 0.215209 (0.223831) |
| read 50000 | 4.370071 / 2.077655 (2.292416) |
| read_batch 50000 10 | 2.199989 / 1.504120 (0.695869) |
| read_batch 50000 100 | 2.009754 / 1.541195 (0.468559) |
| read_batch 50000 1000 | 2.049986 / 1.468490 (0.581496) |
| read_formatted numpy 5000 | 0.422118 / 4.584777 (-4.162659) |
| read_formatted pandas 5000 | 3.814278 / 3.745712 (0.068565) |
| read_formatted tensorflow 5000 | 2.066969 / 5.269862 (-3.202893) |
| read_formatted torch 5000 | 1.232283 / 4.565676 (-3.333393) |
| read_formatted_batch numpy 5000 10 | 0.052088 / 0.424275 (-0.372188) |
| read_formatted_batch numpy 5000 1000 | 0.010982 / 0.007607 (0.003375) |
| shuffled read 5000 | 0.542175 / 0.226044 (0.316130) |
| shuffled read 50000 | 5.410176 / 2.268929 (3.141248) |
| shuffled read_batch 50000 10 | 2.645184 / 55.444624 (-52.799440) |
| shuffled read_batch 50000 100 | 2.319530 / 6.876477 (-4.556947) |
| shuffled read_batch 50000 1000 | 2.428364 / 2.142072 (0.286292) |
| shuffled read_formatted numpy 5000 | 0.530008 / 4.805227 (-4.275219) |
| shuffled read_formatted_batch numpy 5000 10 | 0.119720 / 6.500664 (-6.380944) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.061763 / 0.075469 (-0.013706) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
| --- | --- |
| filter | 1.540664 / 1.841788 (-0.301123) |
| map fast-tokenizer batched | 13.780928 / 8.074308 (5.706620) |
| map identity | 25.210985 / 10.191392 (15.019593) |
| map identity batched | 0.969034 / 0.680424 (0.288610) |
| map no-op batched | 0.622988 / 0.534201 (0.088787) |
| map no-op batched numpy | 0.388446 / 0.579283 (-0.190837) |
| map no-op batched pandas | 0.440178 / 0.434364 (0.005814) |
| map no-op batched pytorch | 0.270808 / 0.540337 (-0.269530) |
| map no-op batched tensorflow | 0.287190 / 1.386936 (-1.099746) |
