Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates for fastparquet evolution #9650

Merged
merged 8 commits on Nov 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion dask/dataframe/io/parquet/fastparquet.py
Expand Up @@ -401,7 +401,7 @@ def _collect_dataset_info(
# Find all files if we are not using a _metadata file
if ignore_metadata_file or not _metadata_exists:
# For now, we need to discover every file under paths[0]
paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs, root=base)
_update_paths = False
for fn in ["_metadata", "_common_metadata"]:
try:
Expand Down Expand Up @@ -490,6 +490,7 @@ def _collect_dataset_info(
raise ValueError(
"No partition-columns should be written in the \n"
"file unless they are ALL written in the file.\n"
"This restriction is removed as of fastparquet 0.8.4\n"
"columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
)

Expand Down
10 changes: 7 additions & 3 deletions dask/dataframe/io/tests/test_parquet.py
Expand Up @@ -3155,13 +3155,17 @@ def test_partitioned_column_overlap(tmpdir, engine, write_cols):
else:
path = str(tmpdir)

if write_cols == ["part", "kind", "col"]:
expect = pd.concat([_df1, _df2], ignore_index=True)
if engine == "fastparquet" and fastparquet_version > parse_version("0.8.3"):
# columns will change order and partitions will be categorical
result = dd.read_parquet(path, engine=engine)
assert result.compute().reset_index(drop=True).to_dict() == expect.to_dict()
elif write_cols == ["part", "kind", "col"]:
result = dd.read_parquet(path, engine=engine)
expect = pd.concat([_df1, _df2], ignore_index=True)
assert_eq(result, expect, check_index=False)
else:
# For now, partial overlap between partition columns and
# real columns is not allowed
# real columns is not allowed for pyarrow or older fastparquet
[Review note: martindurant marked this conversation as resolved.]
with pytest.raises(ValueError):
dd.read_parquet(path, engine=engine)

Expand Down