Split out arrow-schema (#2594) (#2711)

* Split out arrow-schema (#2594)
* Flatten schema
* Move decimal logic
* Fix doc
* Fix tests
* Fix integration-test
* Remove pyarrow orphan
* PyArrow fixes
* Move ArrowError to arrow-schema
* Fix pyarrow
* Fix test
* Fix conflicts
* Fix pyarrow
* Tweak feature flags
* Test juggling
* Derive PyArrowConvert for Vec
apache · Sep 21, 2022 · 48cc8be · 48cc8be
1 parent 74f639c
commit 48cc8be
Show file tree

Hide file tree

Showing 21 changed files with 1,625 additions and 1,493 deletions.
diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
@@ -63,6 +63,8 @@ jobs:
           cargo run --example read_csv_infer_schema
       - name: Run non-archery based integration-tests
         run: cargo test -p arrow-integration-testing
+      - name: Test arrow-schema with all features
+        run: cargo test -p arrow-schema --all-features
 
   # test compilaton features
   linux-features:

diff --git a/Cargo.toml b/Cargo.toml
@@ -18,6 +18,7 @@
 [workspace]
 members = [
         "arrow",
+        "arrow-schema",
         "arrow-buffer",
         "arrow-flight",
         "parquet",

diff --git a/arrow-pyarrow-integration-testing/src/lib.rs b/arrow-pyarrow-integration-testing/src/lib.rs
@@ -28,9 +28,13 @@ use arrow::compute::kernels;
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::error::ArrowError;
 use arrow::ffi_stream::ArrowArrayStreamReader;
-use arrow::pyarrow::PyArrowConvert;
+use arrow::pyarrow::{PyArrowConvert, PyArrowException, PyArrowType};
 use arrow::record_batch::RecordBatch;
 
+fn to_py_err(err: ArrowError) -> PyErr {
+    PyArrowException::new_err(err.to_string())
+}
+
 /// Returns `array + array` of an int64 array.
 #[pyfunction]
 fn double(array: &PyAny, py: Python) -> PyResult<PyObject> {
@@ -41,8 +45,10 @@ fn double(array: &PyAny, py: Python) -> PyResult<PyObject> {
     let array = array
         .as_any()
         .downcast_ref::<Int64Array>()
-        .ok_or(ArrowError::ParseError("Expects an int64".to_string()))?;
-    let array = kernels::arithmetic::add(array, array)?;
+        .ok_or_else(|| ArrowError::ParseError("Expects an int64".to_string()))
+        .map_err(to_py_err)?;
+
+    let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?;
 
     // export
     array.to_pyarrow(py)
@@ -66,56 +72,61 @@ fn double_py(lambda: &PyAny, py: Python) -> PyResult<bool> {
 
 /// Returns the substring
 #[pyfunction]
-fn substring(array: ArrayData, start: i64) -> PyResult<ArrayData> {
+fn substring(
+    array: PyArrowType<ArrayData>,
+    start: i64,
+) -> PyResult<PyArrowType<ArrayData>> {
     // import
-    let array = ArrayRef::from(array);
+    let array = ArrayRef::from(array.0);
 
     // substring
-    let array = kernels::substring::substring(array.as_ref(), start, None)?;
+    let array = kernels::substring::substring(array.as_ref(), start, None).map_err(to_py_err)?;
 
-    Ok(array.data().to_owned())
+    Ok(array.data().to_owned().into())
 }
 
 /// Returns the concatenate
 #[pyfunction]
-fn concatenate(array: ArrayData, py: Python) -> PyResult<PyObject> {
-    let array = ArrayRef::from(array);
+fn concatenate(array: PyArrowType<ArrayData>, py: Python) -> PyResult<PyObject> {
+    let array = ArrayRef::from(array.0);
 
     // concat
-    let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()])?;
+    let array = kernels::concat::concat(&[array.as_ref(), array.as_ref()]).map_err(to_py_err)?;
 
     array.to_pyarrow(py)
 }
 
 #[pyfunction]
-fn round_trip_type(obj: DataType) -> PyResult<DataType> {
+fn round_trip_type(obj: PyArrowType<DataType>) -> PyResult<PyArrowType<DataType>> {
     Ok(obj)
 }
 
 #[pyfunction]
-fn round_trip_field(obj: Field) -> PyResult<Field> {
+fn round_trip_field(obj: PyArrowType<Field>) -> PyResult<PyArrowType<Field>> {
     Ok(obj)
 }
 
 #[pyfunction]
-fn round_trip_schema(obj: Schema) -> PyResult<Schema> {
+fn round_trip_schema(obj: PyArrowType<Schema>) -> PyResult<PyArrowType<Schema>> {
     Ok(obj)
 }
 
 #[pyfunction]
-fn round_trip_array(obj: ArrayData) -> PyResult<ArrayData> {
+fn round_trip_array(obj: PyArrowType<ArrayData>) -> PyResult<PyArrowType<ArrayData>> {
     Ok(obj)
 }
 
 #[pyfunction]
-fn round_trip_record_batch(obj: RecordBatch) -> PyResult<RecordBatch> {
+fn round_trip_record_batch(
+    obj: PyArrowType<RecordBatch>,
+) -> PyResult<PyArrowType<RecordBatch>> {
     Ok(obj)
 }
 
 #[pyfunction]
 fn round_trip_record_batch_reader(
-    obj: ArrowArrayStreamReader,
-) -> PyResult<ArrowArrayStreamReader> {
+    obj: PyArrowType<ArrowArrayStreamReader>,
+) -> PyResult<PyArrowType<ArrowArrayStreamReader>> {
     Ok(obj)
 }
 

diff --git a/arrow-schema/Cargo.toml b/arrow-schema/Cargo.toml
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "arrow-schema"
+version = "23.0.0"
+description = "Defines the logical types for arrow arrays"
+homepage = "https://github.com/apache/arrow-rs"
+repository = "https://github.com/apache/arrow-rs"
+authors = ["Apache Arrow <dev@arrow.apache.org>"]
+license = "Apache-2.0"
+keywords = ["arrow"]
+include = [
+    "benches/*.rs",
+    "src/**/*.rs",
+    "Cargo.toml",
+]
+edition = "2021"
+rust-version = "1.62"
+
+[lib]
+name = "arrow_schema"
+path = "src/lib.rs"
+bench = false
+
+[dependencies]
+serde = { version = "1.0", default-features = false, features = ["derive", "std"], optional = true }
+
+[features]
+default = []
+
+[dev-dependencies]
+serde_json = "1.0"