From 02c5241dc4941fa6db20d620a232b5e7cf5f2349 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 11 Nov 2022 15:36:16 -0500 Subject: [PATCH 1/6] Updates for fastparquet evolution --- dask/dataframe/io/parquet/fastparquet.py | 2 +- dask/dataframe/io/tests/test_parquet.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dask/dataframe/io/parquet/fastparquet.py b/dask/dataframe/io/parquet/fastparquet.py index 67d6bcbd902..cc0cae69996 100644 --- a/dask/dataframe/io/parquet/fastparquet.py +++ b/dask/dataframe/io/parquet/fastparquet.py @@ -401,7 +401,7 @@ def _collect_dataset_info( # Find all files if we are not using a _metadata file if ignore_metadata_file or not _metadata_exists: # For now, we need to discover every file under paths[0] - paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs) + paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs, root=base) _update_paths = False for fn in ["_metadata", "_common_metadata"]: try: diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py index 57aeb4defc2..2b7f9f1fc03 100644 --- a/dask/dataframe/io/tests/test_parquet.py +++ b/dask/dataframe/io/tests/test_parquet.py @@ -3155,13 +3155,18 @@ def test_partitioned_column_overlap(tmpdir, engine, write_cols): else: path = str(tmpdir) - if write_cols == ["part", "kind", "col"]: + if engine == "fastparquet" and fastparquet_version > parse_version("0.8.3"): + # columns will change order and partitions will be categorical + result = dd.read_parquet(path, engine=engine) + expect = pd.concat([_df1, _df2], ignore_index=True) + assert result.compute().reset_index(drop=True).to_dict() == expect.to_dict() + elif write_cols == ["part", "kind", "col"]: result = dd.read_parquet(path, engine=engine) expect = pd.concat([_df1, _df2], ignore_index=True) assert_eq(result, expect, check_index=False) else: # For now, partial overlap between partition columns and - # real columns is not allowed + # real columns is not allowed for pyarrow or older fastparquet with pytest.raises(ValueError): dd.read_parquet(path, engine=engine) From dad3bf679295d86a932304a186c8b773a383fc48 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 14 Nov 2022 11:00:18 -0500 Subject: [PATCH 2/6] Change old error message --- dask/dataframe/io/parquet/fastparquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask/dataframe/io/parquet/fastparquet.py b/dask/dataframe/io/parquet/fastparquet.py index cc0cae69996..cc4b44c6d70 100644 --- a/dask/dataframe/io/parquet/fastparquet.py +++ b/dask/dataframe/io/parquet/fastparquet.py @@ -490,6 +490,7 @@ def _collect_dataset_info( raise ValueError( "No partition-columns should be written in the \n" "file unless they are ALL written in the file.\n" + "This restruction is removed as of fastparquet 0.8.4\n" "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys()) ) From 69a65227ddd7b25c264d6d232bfb168a86c372ae Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 14 Nov 2022 11:41:21 -0500 Subject: [PATCH 3/6] Update dask/dataframe/io/parquet/fastparquet.py Co-authored-by: Richard (Rick) Zamora --- dask/dataframe/io/parquet/fastparquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask/dataframe/io/parquet/fastparquet.py b/dask/dataframe/io/parquet/fastparquet.py index cc4b44c6d70..7d6e5934a11 100644 --- a/dask/dataframe/io/parquet/fastparquet.py +++ b/dask/dataframe/io/parquet/fastparquet.py @@ -490,7 +490,7 @@ def _collect_dataset_info( raise ValueError( "No partition-columns should be written in the \n" "file unless they are ALL written in the file.\n" - "This restruction is removed as of fastparquet 0.8.4\n" + "This restriction is removed as of fastparquet 0.8.4\n" "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys()) ) From 240030b1293aa31fd86bc80907a744ee0a71c0b7 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 14 Nov 2022 16:13:44 -0600 Subject: [PATCH 4/6] test-upstream From 6ddd7f4a0a681767d1e34b6eb08da0aa9e4e3989 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Mon, 14 Nov 2022 16:14:05 -0600 Subject: [PATCH 5/6] test-upstream From 31849e8ce853afd9d9187ef04c1bc499534b6dc0 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 16 Nov 2022 14:45:28 -0600 Subject: [PATCH 6/6] Define expected once --- dask/dataframe/io/tests/test_parquet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py index 2b7f9f1fc03..5e6e47a9f1f 100644 --- a/dask/dataframe/io/tests/test_parquet.py +++ b/dask/dataframe/io/tests/test_parquet.py @@ -3155,14 +3155,13 @@ def test_partitioned_column_overlap(tmpdir, engine, write_cols): else: path = str(tmpdir) + expect = pd.concat([_df1, _df2], ignore_index=True) if engine == "fastparquet" and fastparquet_version > parse_version("0.8.3"): # columns will change order and partitions will be categorical result = dd.read_parquet(path, engine=engine) - expect = pd.concat([_df1, _df2], ignore_index=True) assert result.compute().reset_index(drop=True).to_dict() == expect.to_dict() elif write_cols == ["part", "kind", "col"]: result = dd.read_parquet(path, engine=engine) - expect = pd.concat([_df1, _df2], ignore_index=True) assert_eq(result, expect, check_index=False) else: # For now, partial overlap between partition columns and