New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support dtype_backend="pandas|pyarrow"
configuration
#9719
Changes from 5 commits
9b69961
f79594e
472fbd4
fb3a25f
2ae283b
bf30884
911f36b
ed95fd5
0498e5c
e822b30
dd80bb8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,12 @@ | |
import dask.dataframe as dd | ||
import dask.multiprocessing | ||
from dask.blockwise import Blockwise, optimize_blockwise | ||
from dask.dataframe._compat import PANDAS_GT_110, PANDAS_GT_121, PANDAS_GT_130 | ||
from dask.dataframe._compat import ( | ||
PANDAS_GT_110, | ||
PANDAS_GT_121, | ||
PANDAS_GT_130, | ||
PANDAS_GT_150, | ||
) | ||
from dask.dataframe.io.parquet.core import get_engine | ||
from dask.dataframe.io.parquet.utils import _parse_pandas_metadata | ||
from dask.dataframe.optimize import optimize_dataframe_getitem | ||
|
@@ -618,17 +623,41 @@ def test_roundtrip_nullable_dtypes(tmp_path, write_engine, read_engine): | |
|
||
|
||
@PYARROW_MARK | ||
def test_use_nullable_dtypes(tmp_path, engine): | ||
@pytest.mark.parametrize( | ||
"use_nullable_dtypes", | ||
[ | ||
True, | ||
"pandas", | ||
pytest.param( | ||
"pyarrow", | ||
marks=pytest.mark.skipif( | ||
not PANDAS_GT_150, reason="Requires pyarrow-backed nullable dtypes" | ||
), | ||
), | ||
], | ||
) | ||
def test_use_nullable_dtypes(tmp_path, engine, use_nullable_dtypes): | ||
""" | ||
Test reading a parquet file without pandas metadata, | ||
but forcing use of nullable dtypes where appropriate | ||
""" | ||
|
||
if use_nullable_dtypes in (True, "pandas"): | ||
nullable_backend = "" | ||
else: | ||
nullable_backend = "[pyarrow]" | ||
df = pd.DataFrame( | ||
{ | ||
"a": pd.Series([1, 2, pd.NA, 3, 4], dtype="Int64"), | ||
"b": pd.Series([True, pd.NA, False, True, False], dtype="boolean"), | ||
"c": pd.Series([0.1, 0.2, 0.3, pd.NA, 0.4], dtype="Float64"), | ||
"d": pd.Series(["a", "b", "c", "d", pd.NA], dtype="string"), | ||
"a": pd.Series([1, 2, pd.NA, 3, 4], dtype=f"Int64{nullable_backend}"), | ||
"b": pd.Series( | ||
[True, pd.NA, False, True, False], dtype=f"boolean{nullable_backend}" | ||
), | ||
"c": pd.Series( | ||
[0.1, 0.2, 0.3, pd.NA, 0.4], dtype=f"Float64{nullable_backend}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh nice, I actually didn't know this was case insensitive. (The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, |
||
), | ||
"d": pd.Series( | ||
["a", "b", "c", "d", pd.NA], dtype=f"string{nullable_backend}" | ||
), | ||
} | ||
) | ||
ddf = dd.from_pandas(df, npartitions=2) | ||
|
@@ -647,7 +676,9 @@ def write_partition(df, i): | |
# Not supported by fastparquet | ||
if engine == "fastparquet": | ||
with pytest.raises(ValueError, match="`use_nullable_dtypes` is not supported"): | ||
dd.read_parquet(tmp_path, engine=engine, use_nullable_dtypes=True) | ||
dd.read_parquet( | ||
tmp_path, engine=engine, use_nullable_dtypes=use_nullable_dtypes | ||
) | ||
|
||
# Works in pyarrow | ||
else: | ||
|
@@ -657,10 +688,30 @@ def write_partition(df, i): | |
assert_eq(df, ddf2) | ||
|
||
# Round trip works when we use nullable dtypes | ||
ddf2 = dd.read_parquet(tmp_path, engine=engine, use_nullable_dtypes=True) | ||
ddf2 = dd.read_parquet( | ||
tmp_path, engine=engine, use_nullable_dtypes=use_nullable_dtypes | ||
) | ||
assert_eq(df, ddf2, check_index=False) | ||
|
||
|
||
def test_use_nullable_dtypes_raises(tmp_path, engine): | ||
# Raise an informative error message when `use_nullable_dtypes` is invalid | ||
df = pd.DataFrame({"a": pd.Series([1, 2, pd.NA, 3, 4], dtype="Int64")}) | ||
ddf = dd.from_pandas(df, npartitions=3) | ||
ddf.to_parquet(tmp_path, engine=engine) | ||
|
||
bad_use_nullable_dtypes = "not-a-valid-option" | ||
with pytest.raises(ValueError) as excinfo: | ||
dd.read_parquet( | ||
tmp_path, | ||
engine=engine, | ||
use_nullable_dtypes=bad_use_nullable_dtypes, | ||
) | ||
msg = str(excinfo.value) | ||
assert "Invalid value for `use_nullable_dtypes`" in msg | ||
assert bad_use_nullable_dtypes in msg | ||
|
||
|
||
@pytest.mark.xfail( | ||
not PANDAS_GT_130, | ||
reason=( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Btw, @mroeschke this is the PR I mentioned offline about extending
use_nullable_dtypes
to support "pandas"
and "pyarrow"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yeah this is pretty clean!
Our pandas issues pandas-dev/pandas#48957 (the offline discussion happened here) and pandas-dev/pandas#49997 are examples where some discussion/preference exists for keeping
use_nullable_dtypes
boolean