Skip to content

Commit

Permalink
Handle arrow table with date32 columns
Browse files Browse the repository at this point in the history
  • Loading branch information
jonmmease committed Mar 22, 2024
1 parent 71eea06 commit afd8caa
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 6 deletions.
2 changes: 2 additions & 0 deletions altair/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
infer_vegalite_type,
infer_encoding_types,
sanitize_dataframe,
sanitize_arrow_table,
parse_shorthand,
use_signature,
update_nested,
Expand All @@ -18,6 +19,7 @@
"infer_vegalite_type",
"infer_encoding_types",
"sanitize_dataframe",
"sanitize_arrow_table",
"spec_to_html",
"parse_shorthand",
"use_signature",
Expand Down
8 changes: 4 additions & 4 deletions altair/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,15 +429,15 @@ def sanitize_arrow_table(pa_table):
schema = pa_table.schema
for name in schema.names:
array = pa_table[name]
dtype = schema.field(name).type
if str(dtype).startswith("timestamp"):
dtype_name = str(schema.field(name).type)
if dtype_name.startswith("timestamp") or dtype_name.startswith("date32"):
arrays.append(pc.strftime(array))
elif str(dtype).startswith("duration"):
elif dtype_name.startswith("duration"):
raise ValueError(
'Field "{col_name}" has type "{dtype}" which is '
"not supported by Altair. Please convert to "
"either a timestamp or a numerical value."
"".format(col_name=name, dtype=dtype)
"".format(col_name=name, dtype=dtype_name)
)
else:
arrays.append(array)
Expand Down
2 changes: 1 addition & 1 deletion altair/vegalite/v5/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _dataset_name(values: Union[dict, list, core.InlineDataset]) -> str:
values = values.to_dict()
if values == [{}]:
return "empty"
values_json = json.dumps(values, sort_keys=True)
values_json = json.dumps(values, sort_keys=True, default=str)
hsh = hashlib.sha256(values_json.encode()).hexdigest()[:32]
return "data-" + hsh

Expand Down
49 changes: 48 additions & 1 deletion tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
import pytest

from altair.utils import infer_vegalite_type, sanitize_dataframe
from altair.utils import infer_vegalite_type, sanitize_dataframe, sanitize_arrow_table

try:
import pyarrow as pa
Expand Down Expand Up @@ -120,6 +120,53 @@ def test_sanitize_dataframe_arrow_columns():
json.dumps(records)


@pytest.mark.skipif(pa is None, reason="pyarrow not installed")
def test_sanitize_pyarrow_table_columns():
    """Sanitize a mixed-type pyarrow table and check the resulting rows.

    A pandas frame with string, float, int, bool, datetime, categorical and
    tz-aware datetime columns is converted to a pyarrow table under an
    explicit schema (so the "d" column is typed as date32).  The first
    sanitized row is compared against the expected plain-Python values, and
    the full row list must be serializable with ``json.dumps``.
    """
    naive_hours = pd.date_range("2012-01-01", periods=5, freq="H")
    utc_hours = pd.date_range("2012-01-01", periods=5, freq="H").tz_localize("UTC")
    frame = pd.DataFrame(
        {
            "s": list("abcde"),
            "f": np.arange(5, dtype=float),
            "i": np.arange(5, dtype=int),
            "b": np.array([True, False, True, True, False]),
            "d": naive_hours,
            "c": pd.Series(list("ababc"), dtype="category"),
            "p": utc_hours,
        }
    )

    # An explicit schema is required so that the date32 type is preserved
    # for column "d" instead of being inferred from the pandas dtype.
    explicit_schema = pa.schema(
        [
            pa.field("s", pa.string()),
            pa.field("f", pa.float64()),
            pa.field("i", pa.int64()),
            pa.field("b", pa.bool_()),
            pa.field("d", pa.date32()),
            pa.field("c", pa.dictionary(pa.int8(), pa.string())),
            pa.field("p", pa.timestamp("ns", tz="UTC")),
        ]
    )
    table = pa.Table.from_pandas(frame, schema=explicit_schema)

    rows = sanitize_arrow_table(table).to_pylist()

    expected_first_row = {
        "s": "a",
        "f": 0.0,
        "i": 0,
        "b": True,
        "d": "2012-01-01T00:00:00",
        "c": "a",
        "p": "2012-01-01T00:00:00.000000000",
    }
    assert rows[0] == expected_first_row

    # The sanitized rows must serialize to JSON without raising.
    json.dumps(rows)


def test_sanitize_dataframe_colnames():
df = pd.DataFrame(np.arange(12).reshape(4, 3))

Expand Down

0 comments on commit afd8caa

Please sign in to comment.