Skip to content

Commit

Permalink
Split out arrow-row (#2594) (#3375)
Browse files Browse the repository at this point in the history
* Split out arrow-row (#2594)

* Fix CI

* Fix doc

* Move SortOptions to arrow_schema
  • Loading branch information
tustvold committed Dec 21, 2022
1 parent 0e4ddbf commit db9084e
Show file tree
Hide file tree
Showing 15 changed files with 134 additions and 76 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/arrow.yml
Expand Up @@ -35,6 +35,7 @@ on:
- arrow-ipc/**
- arrow-json/**
- arrow-ord/**
- arrow-row/**
- arrow-schema/**
- arrow-select/**
- arrow-string/**
Expand Down Expand Up @@ -76,6 +77,8 @@ jobs:
run: cargo test -p arrow-string --all-features
- name: Test arrow-ord with all features except SIMD
run: cargo test -p arrow-ord --features dyn_cmp_dict
- name: Test arrow-row with all features
run: cargo test -p arrow-row --all-features
- name: Test arrow-integration-test with all features
run: cargo test -p arrow-integration-test --all-features
- name: Test arrow with default features
Expand Down Expand Up @@ -196,5 +199,7 @@ jobs:
run: cargo clippy -p arrow-string --all-targets --all-features -- -D warnings
- name: Clippy arrow-ord with all features except SIMD
run: cargo clippy -p arrow-ord --all-targets --features dyn_cmp_dict -- -D warnings
- name: Clippy arrow-row with all features
run: cargo clippy -p arrow-row --all-targets --all-features -- -D warnings
- name: Clippy arrow
run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings
3 changes: 2 additions & 1 deletion .github/workflows/integration.yml
Expand Up @@ -33,11 +33,12 @@ on:
- arrow-integration-test/**
- arrow-integration-testing/**
- arrow-ipc/**
- arrow-ord/**
- arrow-json/**
- arrow-ord/**
- arrow-pyarrow-integration-testing/**
- arrow-schema/**
- arrow-select/**
- arrow-row/**
- arrow-string/**
- arrow/**

Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -29,6 +29,7 @@ members = [
"arrow-ipc",
"arrow-json",
"arrow-ord",
"arrow-row",
"arrow-schema",
"arrow-select",
"arrow-string",
Expand Down
21 changes: 2 additions & 19 deletions arrow-ord/src/sort.rs
Expand Up @@ -27,6 +27,8 @@ use arrow_schema::{ArrowError, DataType, IntervalUnit, TimeUnit};
use arrow_select::take::take;
use std::cmp::Ordering;

pub use arrow_schema::SortOptions;

/// Sort the `ArrayRef` using `SortOptions`.
///
/// Performs a sort on values and indices. Nulls are ordered according
Expand Down Expand Up @@ -366,25 +368,6 @@ pub fn sort_to_indices(
})
}

/// Configuration controlling the ordering produced by the sort kernels.
///
/// The two flags are independent of each other: `descending` selects the
/// direction of the ordering and `nulls_first` selects where nulls are placed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SortOptions {
    /// `true` to sort in descending order, `false` for ascending
    pub descending: bool,
    /// `true` to order nulls first, `false` to order them last
    pub nulls_first: bool,
}

impl Default for SortOptions {
    /// Returns the default options: ascending order with nulls first.
    fn default() -> Self {
        SortOptions {
            // nulls come first by default to match spark's behavior
            nulls_first: true,
            descending: false,
        }
    }
}

/// Sort boolean values
///
/// when a limit is present, the sort is pair-comparison based as k-select might be more efficient,
Expand Down
61 changes: 61 additions & 0 deletions arrow-row/Cargo.toml
@@ -0,0 +1,61 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "arrow-row"
version = "29.0.0"
description = "Arrow row format"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
keywords = ["arrow"]
include = [
"benches/*.rs",
"src/**/*.rs",
"Cargo.toml",
]
edition = "2021"
rust-version = "1.62"

[lib]
name = "arrow_row"
path = "src/lib.rs"
bench = false

[target.'cfg(target_arch = "wasm32")'.dependencies]
ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }

[dependencies]
arrow-array = { version = "29.0.0", path = "../arrow-array" }
arrow-buffer = { version = "29.0.0", path = "../arrow-buffer" }
arrow-data = { version = "29.0.0", path = "../arrow-data" }
arrow-schema = { version = "29.0.0", path = "../arrow-schema" }

half = { version = "2.1", default-features = false }
hashbrown = { version = "0.13", default-features = false }

[dev-dependencies]
arrow-cast = { version = "29.0.0", path = "../arrow-cast" }
arrow-ord = { version = "29.0.0", path = "../arrow-ord" }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }

[features]

9 changes: 4 additions & 5 deletions arrow/src/row/dictionary.rs → arrow-row/src/dictionary.rs
Expand Up @@ -15,17 +15,16 @@
// specific language governing permissions and limitations
// under the License.

use crate::compute::SortOptions;
use crate::row::fixed::{FixedLengthEncoding, FromSlice};
use crate::row::interner::{Interned, OrderPreservingInterner};
use crate::row::{null_sentinel, Rows};
use crate::fixed::{FixedLengthEncoding, FromSlice};
use crate::interner::{Interned, OrderPreservingInterner};
use crate::{null_sentinel, Rows};
use arrow_array::builder::*;
use arrow_array::cast::*;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType};
use arrow_schema::{ArrowError, DataType, SortOptions};
use std::collections::hash_map::Entry;
use std::collections::HashMap;

Expand Down
8 changes: 3 additions & 5 deletions arrow/src/row/fixed.rs → arrow-row/src/fixed.rs
Expand Up @@ -16,14 +16,12 @@
// under the License.

use crate::array::PrimitiveArray;
use crate::compute::SortOptions;
use crate::datatypes::ArrowPrimitiveType;
use crate::row::{null_sentinel, Rows};
use crate::{null_sentinel, Rows};
use arrow_array::builder::BufferBuilder;
use arrow_array::{BooleanArray, FixedSizeBinaryArray};
use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray};
use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
use arrow_schema::{DataType, SortOptions};
use half::f16;

pub trait FromSlice {
Expand Down
File renamed without changes.
62 changes: 27 additions & 35 deletions arrow/src/row/mod.rs → arrow-row/src/lib.rs
Expand Up @@ -50,7 +50,7 @@
//! # Basic Example
//! ```
//! # use std::sync::Arc;
//! # use arrow::row::{RowConverter, SortField};
//! # use arrow_row::{RowConverter, SortField};
//! # use arrow_array::{ArrayRef, Int32Array, StringArray};
//! # use arrow_array::cast::{as_primitive_array, as_string_array};
//! # use arrow_array::types::Int32Type;
Expand Down Expand Up @@ -102,7 +102,7 @@
//! The row format can also be used to implement a fast multi-column / lexicographic sort
//!
//! ```
//! # use arrow::row::{RowConverter, SortField};
//! # use arrow_row::{RowConverter, SortField};
//! # use arrow_array::{ArrayRef, UInt32Array};
//! fn lexsort_to_indices(arrays: &[ArrayRef]) -> UInt32Array {
//! let fields = arrays
Expand All @@ -117,11 +117,11 @@
//! }
//! ```
//!
//! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts]
//! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort]
//! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf]
//! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html]
//! [`lexsort`]: crate::compute::kernels::sort::lexsort
//! [non-comparison sorts]: https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts
//! [radix sort]: https://en.wikipedia.org/wiki/Radix_sort
//! [normalized for sorting]: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf
//! [`memcmp`]: https://www.man7.org/linux/man-pages/man3/memcmp.3.html
//! [`lexsort`]: https://docs.rs/arrow-ord/latest/arrow_ord/sort/fn.lexsort.html
//! [compared]: PartialOrd
//! [compare]: PartialOrd

Expand All @@ -131,18 +131,16 @@ use std::sync::Arc;

use arrow_array::cast::*;
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_data::ArrayDataBuilder;
use arrow_schema::*;

use crate::compute::SortOptions;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::row::dictionary::{
use crate::dictionary::{
compute_dictionary_mapping, decode_dictionary, encode_dictionary,
};
use crate::row::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
use crate::row::interner::OrderPreservingInterner;
use crate::row::variable::{decode_binary, decode_string};
use crate::{downcast_dictionary_array, downcast_primitive_array};
use crate::fixed::{decode_bool, decode_fixed_size_binary, decode_primitive};
use crate::interner::OrderPreservingInterner;
use crate::variable::{decode_binary, decode_string};

mod dictionary;
mod fixed;
Expand Down Expand Up @@ -437,7 +435,7 @@ enum Codec {
}

impl Codec {
fn new(sort_field: &SortField) -> Result<Self> {
fn new(sort_field: &SortField) -> Result<Self, ArrowError> {
match &sort_field.data_type {
DataType::Dictionary(_, _) => Ok(Self::Dictionary(Default::default())),
d if !d.is_nested() => Ok(Self::Stateless),
Expand Down Expand Up @@ -485,7 +483,7 @@ impl Codec {
}
}

fn encoder(&mut self, array: &dyn Array) -> Result<Encoder<'_>> {
fn encoder(&mut self, array: &dyn Array) -> Result<Encoder<'_>, ArrowError> {
match self {
Codec::Stateless => Ok(Encoder::Stateless),
Codec::Dictionary(interner) => {
Expand Down Expand Up @@ -577,15 +575,15 @@ impl SortField {

impl RowConverter {
/// Create a new [`RowConverter`] with the provided schema
pub fn new(fields: Vec<SortField>) -> Result<Self> {
pub fn new(fields: Vec<SortField>) -> Result<Self, ArrowError> {
if !Self::supports_fields(&fields) {
return Err(ArrowError::NotYetImplemented(format!(
"Row format support not yet implemented for: {:?}",
fields
)));
}

let codecs = fields.iter().map(Codec::new).collect::<Result<_>>()?;
let codecs = fields.iter().map(Codec::new).collect::<Result<_, _>>()?;
Ok(Self {
fields: fields.into(),
codecs,
Expand Down Expand Up @@ -617,7 +615,7 @@ impl RowConverter {
/// # Panics
///
/// Panics if the schema of `columns` does not match that provided to [`RowConverter::new`]
pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result<Rows> {
pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
if columns.len() != self.fields.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Incorrect number of arrays provided to RowConverter, expected {} got {}",
Expand All @@ -640,7 +638,7 @@ impl RowConverter {
}
codec.encoder(column.as_ref())
})
.collect::<Result<Vec<_>>>()?;
.collect::<Result<Vec<_>, _>>()?;

let config = RowConfig {
fields: Arc::clone(&self.fields),
Expand Down Expand Up @@ -671,7 +669,7 @@ impl RowConverter {
/// # Panics
///
/// Panics if the rows were not produced by this [`RowConverter`]
pub fn convert_rows<'a, I>(&self, rows: I) -> Result<Vec<ArrayRef>>
pub fn convert_rows<'a, I>(&self, rows: I) -> Result<Vec<ArrayRef>, ArrowError>
where
I: IntoIterator<Item = Row<'a>>,
{
Expand Down Expand Up @@ -703,7 +701,7 @@ impl RowConverter {
&self,
rows: &mut [&[u8]],
validate_utf8: bool,
) -> Result<Vec<ArrayRef>> {
) -> Result<Vec<ArrayRef>, ArrowError> {
self.fields
.iter()
.zip(&self.codecs)
Expand Down Expand Up @@ -1196,7 +1194,7 @@ unsafe fn decode_column(
rows: &mut [&[u8]],
codec: &Codec,
validate_utf8: bool,
) -> Result<ArrayRef> {
) -> Result<ArrayRef, ArrowError> {
let options = field.options;

let array: ArrayRef = match codec {
Expand Down Expand Up @@ -1255,24 +1253,18 @@ unsafe fn decode_column(
mod tests {
use std::sync::Arc;

use arrow_array::builder::{
FixedSizeBinaryBuilder, GenericListBuilder, Int32Builder,
};
use rand::distributions::uniform::SampleUniform;
use rand::distributions::{Distribution, Standard};
use rand::{thread_rng, Rng};

use arrow_array::NullArray;
use arrow_array::builder::*;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::i256;
use arrow_buffer::Buffer;
use arrow_cast::display::array_value_to_string;
use arrow_ord::sort::{LexicographicalComparator, SortColumn, SortOptions};

use crate::array::{
BinaryArray, BooleanArray, DictionaryArray, Float32Array, GenericStringArray,
Int16Array, Int32Array, OffsetSizeTrait, PrimitiveArray,
PrimitiveDictionaryBuilder, StringArray,
};
use crate::util::display::array_value_to_string;

use super::*;

#[test]
Expand Down
5 changes: 2 additions & 3 deletions arrow/src/row/list.rs → arrow-row/src/list.rs
Expand Up @@ -15,12 +15,11 @@
// specific language governing permissions and limitations
// under the License.

use crate::compute::SortOptions;
use crate::row::{RowConverter, Rows, SortField};
use crate::{RowConverter, Rows, SortField};
use arrow_array::builder::BufferBuilder;
use arrow_array::{Array, GenericListArray, OffsetSizeTrait};
use arrow_data::ArrayDataBuilder;
use arrow_schema::ArrowError;
use arrow_schema::{ArrowError, SortOptions};
use std::ops::Range;

pub fn compute_lengths<O: OffsetSizeTrait>(
Expand Down
9 changes: 4 additions & 5 deletions arrow/src/row/variable.rs → arrow-row/src/variable.rs
Expand Up @@ -15,14 +15,13 @@
// specific language governing permissions and limitations
// under the License.

use crate::compute::SortOptions;
use crate::row::{null_sentinel, Rows};
use crate::util::bit_util::ceil;
use crate::{null_sentinel, Rows};
use arrow_array::builder::BufferBuilder;
use arrow_array::{Array, GenericBinaryArray, GenericStringArray, OffsetSizeTrait};
use arrow_array::*;
use arrow_buffer::bit_util::ceil;
use arrow_buffer::MutableBuffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::DataType;
use arrow_schema::{DataType, SortOptions};

/// The block size of the variable length encoding
pub const BLOCK_SIZE: usize = 32;
Expand Down

0 comments on commit db9084e

Please sign in to comment.