From 0a872af5d20db2ebaa9de987046cf0d0ee377567 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 28 Apr 2022 15:14:50 +0200 Subject: [PATCH 1/8] CI: force nightly pyarrow in the upstream build --- continuous_integration/scripts/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/continuous_integration/scripts/install.sh b/continuous_integration/scripts/install.sh index 6e491a36f54..200f9e49dec 100644 --- a/continuous_integration/scripts/install.sh +++ b/continuous_integration/scripts/install.sh @@ -7,7 +7,7 @@ set -xe # python -m pip install --no-deps cityhash if [[ ${UPSTREAM_DEV} ]]; then - mamba install -y -c arrow-nightlies "pyarrow>5.0" + mamba install -y -c arrow-nightlies "pyarrow>7.0" # FIXME https://github.com/mamba-org/mamba/issues/412 # mamba uninstall --force numpy pandas fastparquet From 984b39fe8bd6150881123e4c14bed23a774e591b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 28 Apr 2022 15:17:06 +0200 Subject: [PATCH 2/8] test-upstream From f5fa063221a2f7a63b03e0d32124d627f328e528 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 May 2022 10:59:08 +0200 Subject: [PATCH 3/8] temporary skip hanging tests [test-upstream] --- dask/bytes/tests/test_s3.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dask/bytes/tests/test_s3.py b/dask/bytes/tests/test_s3.py index a85b529c04d..74fd39bbc2b 100644 --- a/dask/bytes/tests/test_s3.py +++ b/dask/bytes/tests/test_s3.py @@ -455,6 +455,9 @@ def test_parquet(s3, engine, s3so, metadata_file): ): pytest.skip("#7056 - new s3fs not supported before pyarrow 3.0") + if engine == "pyarrow" and lib_version > parse_version("7.0.0"): + pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") + url = "s3://%s/test.parquet" % test_bucket_name data = pd.DataFrame( @@ -555,6 +558,12 @@ def test_parquet_append(s3, engine, s3so): pd = pytest.importorskip("pandas") np = pytest.importorskip("numpy") + # TEMP + lib = pytest.importorskip(engine) + lib_version = parse_version(lib.__version__) + if engine == "pyarrow" and lib_version > parse_version("7.0.0"): + pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") + url = "s3://%s/test.parquet.append" % test_bucket_name data = pd.DataFrame( @@ -609,6 +618,12 @@ def test_parquet_wstoragepars(s3, s3so, engine): pd = pytest.importorskip("pandas") np = pytest.importorskip("numpy") + # TEMP + lib = pytest.importorskip(engine) + lib_version = parse_version(lib.__version__) + if engine == "pyarrow" and lib_version > parse_version("7.0.0"): + pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") + url = "s3://%s/test.parquet" % test_bucket_name data = pd.DataFrame({"i32": np.array([0, 5, 2, 5])}) From d7be2e9876ee15e3210815c1fef84af5ca1f6f3e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 3 May 2022 15:46:49 +0200 Subject: [PATCH 4/8] fix check_compression [test-upstream] --- dask/dataframe/io/tests/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py index da457f06027..0ac62b6c95a 100644 --- a/dask/dataframe/io/tests/test_parquet.py +++ b/dask/dataframe/io/tests/test_parquet.py @@ -1662,7 +1662,7 @@ def check_compression(engine, filename, compression): else: assert md.total_compressed_size != md.total_uncompressed_size else: - metadata = pa.parquet.ParquetDataset(filename).metadata + metadata = pa.parquet.read_metadata(filename + "/_metadata") names = metadata.schema.names for i in range(metadata.num_row_groups): row_group = metadata.row_group(i) From 626d1dc1158c7793f4208bb993496deca2cf973d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 May 2022 13:00:39 +0200 Subject: [PATCH 5/8] remove temp skips [test-upstream] --- dask/bytes/tests/test_s3.py | 15 --------------- dask/dataframe/io/tests/test_parquet.py | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/dask/bytes/tests/test_s3.py b/dask/bytes/tests/test_s3.py index 74fd39bbc2b..a85b529c04d 100644 --- a/dask/bytes/tests/test_s3.py +++ b/dask/bytes/tests/test_s3.py @@ -455,9 +455,6 @@ def test_parquet(s3, engine, s3so, metadata_file): ): pytest.skip("#7056 - new s3fs not supported before pyarrow 3.0") - if engine == "pyarrow" and lib_version > parse_version("7.0.0"): - pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") - url = "s3://%s/test.parquet" % test_bucket_name data = pd.DataFrame( @@ -558,12 +555,6 @@ def test_parquet_append(s3, engine, s3so): pd = pytest.importorskip("pandas") np = pytest.importorskip("numpy") - # TEMP - lib = pytest.importorskip(engine) - lib_version = parse_version(lib.__version__) - if engine == "pyarrow" and lib_version > parse_version("7.0.0"): - pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") - url = "s3://%s/test.parquet.append" % test_bucket_name data = pd.DataFrame( @@ -618,12 +609,6 @@ def test_parquet_wstoragepars(s3, s3so, engine): pd = pytest.importorskip("pandas") np = pytest.importorskip("numpy") - # TEMP - lib = pytest.importorskip(engine) - lib_version = parse_version(lib.__version__) - if engine == "pyarrow" and lib_version > parse_version("7.0.0"): - pytest.skip("#8993 - parquet dataset with s3 failing on pyarrow master") - url = "s3://%s/test.parquet" % test_bucket_name data = pd.DataFrame({"i32": np.array([0, 5, 2, 5])}) diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py index 0ac62b6c95a..c76e1146bff 100644 --- a/dask/dataframe/io/tests/test_parquet.py +++ b/dask/dataframe/io/tests/test_parquet.py @@ -1662,7 +1662,7 @@ def check_compression(engine, filename, compression): else: assert md.total_compressed_size != md.total_uncompressed_size else: - metadata = pa.parquet.read_metadata(filename + "/_metadata") + metadata = pa.parquet.read_metadata(os.path.join(filename, "_metadata")) names = metadata.schema.names for i in range(metadata.num_row_groups): row_group = metadata.row_group(i) From 33cced5e83eaa05b548b8d427dea0bc56603d882 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 May 2022 09:22:54 +0200 Subject: [PATCH 6/8] [test-upstream] From 8afb7ea336a39fc51d00b31d108910cd7610be9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 May 2022 11:39:10 +0200 Subject: [PATCH 7/8] [test-upstream] From 0c713717f8d40636a64bec766038a06909cc12af Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 08:26:57 +0200 Subject: [PATCH 8/8] [test-upstream]