Handle arrow table with date32 columns

vega · Mar 22, 2024 · afd8caa · afd8caa
1 parent 71eea06
commit afd8caa
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 6 deletions.
diff --git a/altair/utils/__init__.py b/altair/utils/__init__.py
@@ -2,6 +2,7 @@
     infer_vegalite_type,
     infer_encoding_types,
     sanitize_dataframe,
+    sanitize_arrow_table,
     parse_shorthand,
     use_signature,
     update_nested,
@@ -18,6 +19,7 @@
     "infer_vegalite_type",
     "infer_encoding_types",
     "sanitize_dataframe",
+    "sanitize_arrow_table",
     "spec_to_html",
     "parse_shorthand",
     "use_signature",

diff --git a/altair/utils/core.py b/altair/utils/core.py
@@ -429,15 +429,15 @@ def sanitize_arrow_table(pa_table):
     schema = pa_table.schema
     for name in schema.names:
         array = pa_table[name]
-        dtype = schema.field(name).type
-        if str(dtype).startswith("timestamp"):
+        dtype_name = str(schema.field(name).type)
+        if dtype_name.startswith("timestamp") or dtype_name.startswith("date32"):
             arrays.append(pc.strftime(array))
-        elif str(dtype).startswith("duration"):
+        elif dtype_name.startswith("duration"):
             raise ValueError(
                 'Field "{col_name}" has type "{dtype}" which is '
                 "not supported by Altair. Please convert to "
                 "either a timestamp or a numerical value."
-                "".format(col_name=name, dtype=dtype)
+                "".format(col_name=name, dtype=dtype_name)
             )
         else:
             arrays.append(array)

diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
@@ -56,7 +56,7 @@ def _dataset_name(values: Union[dict, list, core.InlineDataset]) -> str:
         values = values.to_dict()
     if values == [{}]:
         return "empty"
-    values_json = json.dumps(values, sort_keys=True)
+    values_json = json.dumps(values, sort_keys=True, default=str)
     hsh = hashlib.sha256(values_json.encode()).hexdigest()[:32]
     return "data-" + hsh
 

diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import pytest
 
-from altair.utils import infer_vegalite_type, sanitize_dataframe
+from altair.utils import infer_vegalite_type, sanitize_dataframe, sanitize_arrow_table
 
 try:
     import pyarrow as pa
@@ -120,6 +120,53 @@ def test_sanitize_dataframe_arrow_columns():
     json.dumps(records)
 
 
+@pytest.mark.skipif(pa is None, reason="pyarrow not installed")
+def test_sanitize_pyarrow_table_columns():
+    # create a dataframe with various types
+    df = pd.DataFrame(
+        {
+            "s": list("abcde"),
+            "f": np.arange(5, dtype=float),
+            "i": np.arange(5, dtype=int),
+            "b": np.array([True, False, True, True, False]),
+            "d": pd.date_range("2012-01-01", periods=5, freq="H"),
+            "c": pd.Series(list("ababc"), dtype="category"),
+            "p": pd.date_range("2012-01-01", periods=5, freq="H").tz_localize("UTC"),
+        }
+    )
+
+    # Create pyarrow table with explicit schema so that date32 type is preserved
+    pa_table = pa.Table.from_pandas(
+        df,
+        pa.schema(
+            [
+                pa.field("s", pa.string()),
+                pa.field("f", pa.float64()),
+                pa.field("i", pa.int64()),
+                pa.field("b", pa.bool_()),
+                pa.field("d", pa.date32()),
+                pa.field("c", pa.dictionary(pa.int8(), pa.string())),
+                pa.field("p", pa.timestamp("ns", tz="UTC")),
+            ]
+        ),
+    )
+    sanitized = sanitize_arrow_table(pa_table)
+    values = sanitized.to_pylist()
+
+    assert values[0] == {
+        "s": "a",
+        "f": 0.0,
+        "i": 0,
+        "b": True,
+        "d": "2012-01-01T00:00:00",
+        "c": "a",
+        "p": "2012-01-01T00:00:00.000000000",
+    }
+
+    # Make sure we can serialize to JSON without error
+    json.dumps(values)
+
+
 def test_sanitize_dataframe_colnames():
     df = pd.DataFrame(np.arange(12).reshape(4, 3))