Skip to content

Commit

Permalink
Split out arrow-ipc (#3022)
Browse files Browse the repository at this point in the history
* Split out arrow-ipc

* RAT

* Fix doc

* Tweak required-features

* Clippy

* Fix feature flags
  • Loading branch information
tustvold committed Nov 6, 2022
1 parent 108e7d2 commit deb6455
Show file tree
Hide file tree
Showing 30 changed files with 648 additions and 592 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/arrow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ on:
- arrow-schema/**
- arrow-select/**
- arrow-integration-test/**
- arrow-ipc/**
- .github/**

jobs:
Expand Down Expand Up @@ -61,6 +62,8 @@ jobs:
run: cargo test -p arrow-select --all-features
- name: Test arrow-cast with all features
run: cargo test -p arrow-cast --all-features
- name: Test arrow-ipc with all features
run: cargo test -p arrow-ipc --all-features
- name: Test arrow-integration-test with all features
run: cargo test -p arrow-integration-test --all-features
- name: Test arrow with default features
Expand Down Expand Up @@ -169,5 +172,7 @@ jobs:
run: cargo clippy -p arrow-select --all-targets --all-features -- -D warnings
- name: Clippy arrow-cast with all features
run: cargo clippy -p arrow-cast --all-targets --all-features -- -D warnings
- name: Clippy arrow-ipc with all features
run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings
- name: Clippy arrow
run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings
1 change: 1 addition & 0 deletions .github/workflows/arrow_flight.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ on:
- arrow-schema/**
- arrow-select/**
- arrow-flight/**
- arrow-ipc/**
- .github/**

jobs:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/dev_pr/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ arrow:
- arrow-data/**/*
- arrow-schema/**/*
- arrow-select/**/*
- arrow-ipc/**/*

arrow-flight:
- arrow-flight/**/*
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ on:
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-ipc/**
- arrow-pyarrow-integration-testing/**
- arrow-integration-test/**
- arrow-integration-testing/**
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/miri.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ on:
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-ipc/**
- .github/**

jobs:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/parquet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ on:
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-ipc/**
- parquet/**
- .github/**

Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ members = [
"arrow-flight",
"arrow-integration-test",
"arrow-integration-testing",
"arrow-ipc",
"arrow-schema",
"arrow-select",
"parquet",
Expand Down
2 changes: 1 addition & 1 deletion arrow-array/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ pub mod array;
pub use array::*;

mod record_batch;
pub use record_batch::{RecordBatch, RecordBatchOptions};
pub use record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader};

mod arithmetic;
pub use arithmetic::ArrowNativeTypeOp;
Expand Down
18 changes: 18 additions & 0 deletions arrow-array/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,24 @@ use crate::{new_empty_array, Array, ArrayRef, StructArray};
use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
use std::sync::Arc;

/// Trait for types that produce a stream of [`RecordBatch`]es, all sharing a common schema.
pub trait RecordBatchReader: Iterator<Item = Result<RecordBatch, ArrowError>> {
    /// Returns the schema of this `RecordBatchReader`.
    ///
    /// Implementations of this trait should guarantee that all `RecordBatch`es
    /// returned by this reader have the same schema as returned from this method.
    fn schema(&self) -> SchemaRef;

    /// Reads the next `RecordBatch`.
    #[deprecated(
        since = "2.0.0",
        note = "This method is deprecated in favour of `next` from the trait Iterator."
    )]
    fn next_batch(&mut self) -> Result<Option<RecordBatch>, ArrowError> {
        // The Iterator yields Option<Result<RecordBatch, _>>; transpose converts
        // that into the Result<Option<RecordBatch>, _> shape this legacy API exposes.
        self.next().transpose()
    }
}

/// A two-dimensional batch of column-oriented data with a defined
/// [schema](arrow_schema::Schema).
///
Expand Down
37 changes: 37 additions & 0 deletions arrow-ipc/CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Developer's guide

## IPC

The expected `flatc` version is 1.12.0 or newer. It is built by `regen.sh` from the
[flatbuffers](https://github.com/google/flatbuffers) repository, pinned to a fixed commit on master.

The IPC flatbuffer code was generated by running this command from the root of the project:

```bash
./regen.sh
```

The above script will run the `flatc` compiler and perform some adjustments to the source code:

- Replace `type__` with `type_`
- Remove `org::apache::arrow::flatbuffers` namespace
- Add includes to each generated file
51 changes: 51 additions & 0 deletions arrow-ipc/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "arrow-ipc"
version = "26.0.0"
description = "Support for the Arrow IPC format"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
keywords = ["arrow"]
include = [
"benches/*.rs",
"src/**/*.rs",
"Cargo.toml",
]
edition = "2021"
rust-version = "1.62"

[lib]
name = "arrow_ipc"
path = "src/lib.rs"
bench = false

[dependencies]
arrow-array = { version = "26.0.0", path = "../arrow-array" }
arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" }
arrow-cast = { version = "26.0.0", path = "../arrow-cast" }
arrow-data = { version = "26.0.0", path = "../arrow-data" }
arrow-schema = { version = "26.0.0", path = "../arrow-schema" }
flatbuffers = { version = "22.9.2", default-features = false, features = ["thiserror"] }
lz4 = { version = "1.23", default-features = false, optional = true }
zstd = { version = "0.11.1", default-features = false, optional = true }

[dev-dependencies]
tempfile = "3.3"
File renamed without changes.
133 changes: 87 additions & 46 deletions arrow/src/ipc/compression/codec.rs → arrow-ipc/src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use crate::buffer::Buffer;
use crate::error::{ArrowError, Result};
use crate::ipc::CompressionType;
use std::io::{Read, Write};
use crate::CompressionType;
use arrow_buffer::Buffer;
use arrow_schema::ArrowError;

const LENGTH_NO_COMPRESSED_DATA: i64 = -1;
const LENGTH_OF_PREFIX_DATA: i64 = 8;
Expand All @@ -33,7 +32,7 @@ pub enum CompressionCodec {
impl TryFrom<CompressionType> for CompressionCodec {
type Error = ArrowError;

fn try_from(compression_type: CompressionType) -> Result<Self> {
fn try_from(compression_type: CompressionType) -> Result<Self, ArrowError> {
match compression_type {
CompressionType::ZSTD => Ok(CompressionCodec::Zstd),
CompressionType::LZ4_FRAME => Ok(CompressionCodec::Lz4Frame),
Expand All @@ -60,7 +59,7 @@ impl CompressionCodec {
&self,
input: &[u8],
output: &mut Vec<u8>,
) -> Result<usize> {
) -> Result<usize, ArrowError> {
let uncompressed_data_len = input.len();
let original_output_len = output.len();

Expand Down Expand Up @@ -92,7 +91,10 @@ impl CompressionCodec {
/// [8 bytes]: uncompressed length
/// [remaining bytes]: compressed data stream
/// ```
pub(crate) fn decompress_to_buffer(&self, input: &Buffer) -> Result<Buffer> {
pub(crate) fn decompress_to_buffer(
&self,
input: &Buffer,
) -> Result<Buffer, ArrowError> {
// read the first 8 bytes to determine if the data is
// compressed
let decompressed_length = read_uncompressed_size(input);
Expand All @@ -115,50 +117,89 @@ impl CompressionCodec {

/// Compress the data in input buffer and write to output buffer
/// using the specified compression
fn compress(&self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
fn compress(&self, input: &[u8], output: &mut Vec<u8>) -> Result<(), ArrowError> {
match self {
CompressionCodec::Lz4Frame => {
let mut encoder = lz4::EncoderBuilder::new().build(output)?;
encoder.write_all(input)?;
match encoder.finish().1 {
Ok(_) => Ok(()),
Err(e) => Err(e.into()),
}
}
CompressionCodec::Zstd => {
let mut encoder = zstd::Encoder::new(output, 0)?;
encoder.write_all(input)?;
match encoder.finish() {
Ok(_) => Ok(()),
Err(e) => Err(e.into()),
}
}
CompressionCodec::Lz4Frame => compress_lz4(input, output),
CompressionCodec::Zstd => compress_zstd(input, output),
}
}

/// Decompress the data in input buffer and write to output buffer
/// using the specified compression
fn decompress(&self, input: &[u8], output: &mut Vec<u8>) -> Result<usize> {
let result: Result<usize> = match self {
CompressionCodec::Lz4Frame => {
let mut decoder = lz4::Decoder::new(input)?;
match decoder.read_to_end(output) {
Ok(size) => Ok(size),
Err(e) => Err(e.into()),
}
}
CompressionCodec::Zstd => {
let mut decoder = zstd::Decoder::new(input)?;
match decoder.read_to_end(output) {
Ok(size) => Ok(size),
Err(e) => Err(e.into()),
}
}
};
result
fn decompress(
&self,
input: &[u8],
output: &mut Vec<u8>,
) -> Result<usize, ArrowError> {
match self {
CompressionCodec::Lz4Frame => decompress_lz4(input, output),
CompressionCodec::Zstd => decompress_zstd(input, output),
}
}
}

/// Compress `data` with the LZ4 frame format, appending the frame to `dst`.
#[cfg(feature = "lz4")]
fn compress_lz4(data: &[u8], dst: &mut Vec<u8>) -> Result<(), ArrowError> {
    use std::io::Write;
    let mut encoder = lz4::EncoderBuilder::new().build(dst)?;
    encoder.write_all(data)?;
    // finish() returns (writer, Result); surface any flush/close error.
    let (_writer, result) = encoder.finish();
    result?;
    Ok(())
}

/// Stub used when the `lz4` feature is disabled; always returns an error.
#[cfg(not(feature = "lz4"))]
#[allow(clippy::ptr_arg)]
fn compress_lz4(_data: &[u8], _dst: &mut Vec<u8>) -> Result<(), ArrowError> {
    Err(ArrowError::InvalidArgumentError(String::from(
        "lz4 IPC compression requires the lz4 feature",
    )))
}

/// Decompress an LZ4 frame from `data` into `dst`, returning the number of bytes read out.
#[cfg(feature = "lz4")]
fn decompress_lz4(data: &[u8], dst: &mut Vec<u8>) -> Result<usize, ArrowError> {
    use std::io::Read;
    let mut decoder = lz4::Decoder::new(data)?;
    let decompressed = decoder.read_to_end(dst)?;
    Ok(decompressed)
}

/// Stub used when the `lz4` feature is disabled; always returns an error.
#[cfg(not(feature = "lz4"))]
#[allow(clippy::ptr_arg)]
fn decompress_lz4(_data: &[u8], _dst: &mut Vec<u8>) -> Result<usize, ArrowError> {
    Err(ArrowError::InvalidArgumentError(String::from(
        "lz4 IPC decompression requires the lz4 feature",
    )))
}

/// Compress `data` with zstd (default compression level 0), appending the stream to `dst`.
#[cfg(feature = "zstd")]
fn compress_zstd(data: &[u8], dst: &mut Vec<u8>) -> Result<(), ArrowError> {
    use std::io::Write;
    // Level 0 selects the zstd library's default compression level.
    let mut encoder = zstd::Encoder::new(dst, 0)?;
    encoder.write_all(data)?;
    // Finalize the frame; propagate any error from flushing the trailer.
    let _writer = encoder.finish()?;
    Ok(())
}

/// Stub used when the `zstd` feature is disabled; always returns an error.
#[cfg(not(feature = "zstd"))]
#[allow(clippy::ptr_arg)]
fn compress_zstd(_data: &[u8], _dst: &mut Vec<u8>) -> Result<(), ArrowError> {
    Err(ArrowError::InvalidArgumentError(String::from(
        "zstd IPC compression requires the zstd feature",
    )))
}

/// Decompress a zstd stream from `data` into `dst`, returning the number of bytes read out.
#[cfg(feature = "zstd")]
fn decompress_zstd(data: &[u8], dst: &mut Vec<u8>) -> Result<usize, ArrowError> {
    use std::io::Read;
    let mut decoder = zstd::Decoder::new(data)?;
    let decompressed = decoder.read_to_end(dst)?;
    Ok(decompressed)
}

/// Stub used when the `zstd` feature is disabled; always returns an error.
#[cfg(not(feature = "zstd"))]
#[allow(clippy::ptr_arg)]
fn decompress_zstd(_data: &[u8], _dst: &mut Vec<u8>) -> Result<usize, ArrowError> {
    Err(ArrowError::InvalidArgumentError(String::from(
        "zstd IPC decompression requires the zstd feature",
    )))
}

/// Get the uncompressed length
/// Notes:
/// LENGTH_NO_COMPRESSED_DATA: indicate that the data that follows is not compressed
Expand All @@ -173,12 +214,11 @@ fn read_uncompressed_size(buffer: &[u8]) -> i64 {

#[cfg(test)]
mod tests {
use super::*;

#[test]
#[cfg(feature = "lz4")]
fn test_lz4_compression() {
let input_bytes = "hello lz4".as_bytes();
let codec: CompressionCodec = CompressionCodec::Lz4Frame;
let codec = super::CompressionCodec::Lz4Frame;
let mut output_bytes: Vec<u8> = Vec::new();
codec.compress(input_bytes, &mut output_bytes).unwrap();
let mut result_output_bytes: Vec<u8> = Vec::new();
Expand All @@ -189,9 +229,10 @@ mod tests {
}

#[test]
#[cfg(feature = "zstd")]
fn test_zstd_compression() {
let input_bytes = "hello zstd".as_bytes();
let codec: CompressionCodec = CompressionCodec::Zstd;
let codec = super::CompressionCodec::Zstd;
let mut output_bytes: Vec<u8> = Vec::new();
codec.compress(input_bytes, &mut output_bytes).unwrap();
let mut result_output_bytes: Vec<u8> = Vec::new();
Expand Down

0 comments on commit deb6455

Please sign in to comment.