Upgrade to DataFusion 13 #54

Closed · wants to merge 6 commits
102 changes: 41 additions & 61 deletions Cargo.lock

(Cargo.lock is a generated file; its diff is not rendered.)

11 changes: 11 additions & 0 deletions Cargo.toml
@@ -52,3 +52,14 @@ name = "datafusion._internal"
 [profile.release]
 lto = true
 codegen-units = 1
+
+
+[patch.crates-io]
+arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "6f62bb62f630cbd910ae5b1b04f97688af7c1b42" }
+parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "6f62bb62f630cbd910ae5b1b04f97688af7c1b42" }
+arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "6f62bb62f630cbd910ae5b1b04f97688af7c1b42" }
+arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "6f62bb62f630cbd910ae5b1b04f97688af7c1b42" }
+
+datafusion = { git = "https://github.com/tustvold/arrow-datafusion.git", rev = "9de354bf45c0cc4121af04ea8138df7fddab76ed" }
+datafusion-expr = { git = "https://github.com/tustvold/arrow-datafusion.git", rev = "9de354bf45c0cc4121af04ea8138df7fddab76ed" }
+datafusion-common = { git = "https://github.com/tustvold/arrow-datafusion.git", rev = "9de354bf45c0cc4121af04ea8138df7fddab76ed" }
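
Note: the `[patch.crates-io]` table above redirects every dependency on these crates (including transitive ones pulled in through DataFusion) to the pinned git revisions, presumably as a stopgap until matching arrow-rs and DataFusion releases reach crates.io.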
26 changes: 19 additions & 7 deletions src/context.rs
@@ -24,6 +24,7 @@ use pyo3::exceptions::{PyKeyError, PyValueError};
 use pyo3::prelude::*;
 
 use datafusion::arrow::datatypes::Schema;
+use datafusion::arrow::pyarrow::PyArrowType;
 use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::datasource::datasource::TableProvider;
 use datafusion::datasource::MemTable;
@@ -99,9 +100,16 @@ impl PySessionContext {
         Ok(PyDataFrame::new(df))
     }
 
-    fn create_dataframe(&mut self, partitions: Vec<Vec<RecordBatch>>) -> PyResult<PyDataFrame> {
-        let table = MemTable::try_new(partitions[0][0].schema(), partitions)
-            .map_err(DataFusionError::from)?;
+    fn create_dataframe(
+        &mut self,
+        partitions: Vec<Vec<PyArrowType<RecordBatch>>>,
+    ) -> PyResult<PyDataFrame> {
+        let schema = partitions[0][0].0.schema();
+        let partitions = partitions
+            .into_iter()
+            .map(|x| x.into_iter().map(|x| x.0).collect())
+            .collect();
+        let table = MemTable::try_new(schema, partitions).map_err(DataFusionError::from)?;
 
         // generate a random (unique) name for this table
         // table name cannot start with numeric digit
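
The `PyArrowType` wrapper threaded through these signatures comes from arrow-rs's pyo3 integration. A minimal sketch of the pattern, assuming the real definition in `datafusion::arrow::pyarrow` differs only in detail:

// Sketch: a local tuple struct wrapping a foreign type, so the crate can
// implement pyo3's conversion traits for arrow types without hitting the
// orphan rule. Call sites recover the inner value with `.0`, as in the
// diff above.
pub struct PyArrowType<T>(pub T);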
@@ -136,9 +144,13 @@ impl PySessionContext {
     fn register_record_batches(
         &mut self,
         name: &str,
-        partitions: Vec<Vec<RecordBatch>>,
+        partitions: Vec<Vec<PyArrowType<RecordBatch>>>,
     ) -> PyResult<()> {
-        let schema = partitions[0][0].schema();
+        let schema = partitions[0][0].0.schema();
+        let partitions = partitions
+            .into_iter()
+            .map(|x| x.into_iter().map(|x| x.0).collect())
+            .collect();
         let table = MemTable::try_new(schema, partitions)?;
         self.ctx
             .register_table(name, Arc::new(table))
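
The same unwrap dance over nested partitions appears in both `create_dataframe` and `register_record_batches`; a hypothetical helper (not part of this PR) could factor it out:

use datafusion::arrow::pyarrow::PyArrowType;
use datafusion::arrow::record_batch::RecordBatch;

// Hypothetical helper, not in the PR: strip the PyArrowType wrapper from a
// nested Vec of partitions, yielding the plain RecordBatches MemTable expects.
fn unwrap_partitions(
    partitions: Vec<Vec<PyArrowType<RecordBatch>>>,
) -> Vec<Vec<RecordBatch>> {
    partitions
        .into_iter()
        .map(|part| part.into_iter().map(|batch| batch.0).collect())
        .collect()
}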
@@ -182,7 +194,7 @@ impl PySessionContext {
         &mut self,
         name: &str,
         path: PathBuf,
-        schema: Option<Schema>,
+        schema: Option<PyArrowType<Schema>>,
         has_header: bool,
         delimiter: &str,
         schema_infer_max_records: usize,
@@ -204,7 +216,7 @@
             .delimiter(delimiter[0])
             .schema_infer_max_records(schema_infer_max_records)
             .file_extension(file_extension);
-        options.schema = schema.as_ref();
+        options.schema = schema.as_ref().map(|x| &x.0);
 
         let result = self.ctx.register_csv(name, path, options);
         wait_for_future(py, result).map_err(DataFusionError::from)?;
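
Because `CsvReadOptions` borrows its schema rather than owning it, the wrapped value is projected to a reference of the inner `Schema` with `.map(|x| &x.0)` instead of being moved out of the wrapper.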
12 changes: 7 additions & 5 deletions src/dataframe.rs
@@ -18,7 +18,7 @@
 use crate::utils::wait_for_future;
 use crate::{errors::DataFusionError, expression::PyExpr};
 use datafusion::arrow::datatypes::Schema;
-use datafusion::arrow::pyarrow::PyArrowConvert;
+use datafusion::arrow::pyarrow::{PyArrowConvert, PyArrowException, PyArrowType};
 use datafusion::arrow::util::pretty;
 use datafusion::dataframe::DataFrame;
 use datafusion::prelude::*;
@@ -65,8 +65,8 @@ impl PyDataFrame {
     }
 
     /// Returns the schema from the logical plan
-    fn schema(&self) -> Schema {
-        self.df.schema().into()
+    fn schema(&self) -> PyArrowType<Schema> {
+        PyArrowType(self.df.schema().into())
     }
 
     #[args(args = "*")]
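
Returning `PyArrowType<Schema>` rather than a bare `Schema` should leave the Python-facing behavior unchanged (callers still receive a `pyarrow.Schema`), since the wrapper exists only to carry the conversion.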
@@ -144,7 +144,8 @@ impl PyDataFrame {
     fn show(&self, py: Python, num: usize) -> PyResult<()> {
         let df = self.df.limit(0, Some(num))?;
         let batches = wait_for_future(py, df.collect())?;
-        Ok(pretty::print_batches(&batches)?)
+        Ok(pretty::print_batches(&batches)
+            .map_err(|err| PyArrowException::new_err(err.to_string()))?)
     }
 
     /// Filter out duplicate rows
@@ -186,7 +187,8 @@ impl PyDataFrame {
     fn explain(&self, py: Python, verbose: bool, analyze: bool) -> PyResult<()> {
         let df = self.df.explain(verbose, analyze)?;
         let batches = wait_for_future(py, df.collect())?;
-        Ok(pretty::print_batches(&batches)?)
+        Ok(pretty::print_batches(&batches)
+            .map_err(|err| PyArrowException::new_err(err.to_string()))?)
     }
 
     /// Repartition a `DataFrame` based on a logical partitioning scheme.
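
In both `show` and `explain`, the bare `?` on `pretty::print_batches` no longer suffices, presumably because the upgraded arrow dropped its blanket `ArrowError` to `PyErr` conversion; the error is now mapped explicitly into a `PyArrowException` that Python callers can catch.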
10 changes: 9 additions & 1 deletion src/dataset.rs
@@ -27,6 +27,7 @@ use std::sync::Arc;
 use async_trait::async_trait;
 
 use datafusion::arrow::datatypes::SchemaRef;
+use datafusion::arrow::pyarrow::PyArrowType;
 use datafusion::datasource::datasource::TableProviderFilterPushDown;
 use datafusion::datasource::{TableProvider, TableType};
 use datafusion::error::{DataFusionError, Result as DFResult};
@@ -74,7 +75,14 @@ impl TableProvider for Dataset {
         Python::with_gil(|py| {
             let dataset = self.dataset.as_ref(py);
             // This can panic but since we checked that self.dataset is a pyarrow.dataset.Dataset it should never
-            Arc::new(dataset.getattr("schema").unwrap().extract().unwrap())
+            Arc::new(
+                dataset
+                    .getattr("schema")
+                    .unwrap()
+                    .extract::<PyArrowType<_>>()
+                    .unwrap()
+                    .0,
+            )
         })
     }
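
With the wrapper in play, `extract()` goes through `PyArrowType` and the inner schema is unwrapped with `.0`. A standalone sketch of the same conversion (names here are illustrative, not from the PR):

use std::sync::Arc;
use datafusion::arrow::datatypes::{Schema, SchemaRef};
use datafusion::arrow::pyarrow::PyArrowType;
use pyo3::prelude::*;

// Illustrative: read a `schema` attribute off a Python object as a pyarrow
// schema, then unwrap to the plain arrow Schema behind an Arc.
fn schema_from_pyarrow(obj: &PyAny) -> PyResult<SchemaRef> {
    let wrapped: PyArrowType<Schema> = obj.getattr("schema")?.extract()?;
    Ok(Arc::new(wrapped.0))
}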
