Skip to content

Commit

Permalink
Split out value selection kernels into arrow-select (#2594) (#2885)
Browse files Browse the repository at this point in the history
* Split out arrow-select (#2594)

* Fix doc

* Clippy
  • Loading branch information
tustvold committed Oct 20, 2022
1 parent 8be8209 commit 9a8b04d
Show file tree
Hide file tree
Showing 16 changed files with 353 additions and 407 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/arrow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ on:
- arrow-buffer/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-integration-test/**
- .github/**

Expand Down Expand Up @@ -61,6 +62,8 @@ jobs:
run: cargo test -p arrow-schema --all-features
- name: Test arrow-array with all features
run: cargo test -p arrow-array --all-features
- name: Test arrow-select with all features
run: cargo test -p arrow-select --all-features
- name: Test arrow-integration-test with all features
run: cargo test -p arrow-integration-test --all-features
- name: Test arrow
Expand Down Expand Up @@ -193,13 +196,15 @@ jobs:
run: |
rustup component add clippy
- name: Clippy arrow-buffer with all features
run: cargo clippy -p arrow-buffer --all-features
run: cargo clippy -p arrow-buffer --all-targets --all-features
- name: Clippy arrow-data with all features
run: cargo clippy -p arrow-data --all-features
run: cargo clippy -p arrow-data --all-targets --all-features
- name: Clippy arrow-schema with all features
run: cargo clippy -p arrow-schema --all-features
run: cargo clippy -p arrow-schema --all-targets --all-features
- name: Clippy arrow-array with all features
run: cargo clippy -p arrow-array --all-features
run: cargo clippy -p arrow-array --all-targets --all-features
- name: Clippy arrow-select with all features
run: cargo clippy -p arrow-select --all-targets --all-features
- name: Clippy arrow
run: |
cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict --all-targets -- -D warnings
1 change: 1 addition & 0 deletions .github/workflows/arrow_flight.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ on:
- arrow-buffer/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-flight/**
- .github/**

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/dev_pr/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ arrow:
- arrow-buffer/**/*
- arrow-data/**/*
- arrow-schema/**/*
- arrow-select/**/*

arrow-flight:
- arrow-flight/**/*
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ on:
- arrow-buffer/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- arrow-pyarrow-integration-testing/**
- arrow-integration-test/**
- arrow-integration-testing/**
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/miri.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ on:
- arrow-buffer/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- .github/**

jobs:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/parquet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ on:
- arrow-buffer/**
- arrow-data/**
- arrow-schema/**
- arrow-select/**
- parquet/**
- .github/**

Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ members = [
"arrow-integration-test",
"arrow-integration-testing",
"arrow-schema",
"arrow-select",
"parquet",
"parquet_derive",
"parquet_derive_test",
Expand Down
51 changes: 51 additions & 0 deletions arrow-select/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "arrow-select"
version = "25.0.0"
description = "Selection kernels for arrow arrays"
homepage = "https://github.com/apache/arrow-rs"
repository = "https://github.com/apache/arrow-rs"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
keywords = ["arrow"]
include = [
"benches/*.rs",
"src/**/*.rs",
"Cargo.toml",
]
edition = "2021"
rust-version = "1.62"

[lib]
name = "arrow_select"
path = "src/lib.rs"
bench = false

[dependencies]
arrow-buffer = { version = "25.0.0", path = "../arrow-buffer" }
arrow-data = { version = "25.0.0", path = "../arrow-data" }
arrow-schema = { version = "25.0.0", path = "../arrow-schema" }
arrow-array = { version = "25.0.0", path = "../arrow-array" }
num = { version = "0.4", default-features = false, features = ["std"] }

[features]
default = []

[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
101 changes: 33 additions & 68 deletions arrow/src/compute/kernels/filter.rs → arrow-select/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@
// specific language governing permissions and limitations
// under the License.

//! Defines miscellaneous array kernels.
//! Defines filter kernels

use std::ops::AddAssign;
use std::sync::Arc;

use num::Zero;

use crate::array::*;
use crate::buffer::{buffer_bin_and, Buffer, MutableBuffer};
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::RecordBatch;
use crate::util::bit_iterator::{BitIndexIterator, BitSliceIterator};
use crate::util::bit_util;
use crate::{downcast_dictionary_array, downcast_primitive_array};
use arrow_array::builder::BooleanBufferBuilder;
use arrow_array::*;
use arrow_buffer::bit_util;
use arrow_buffer::{buffer::buffer_bin_and, Buffer, MutableBuffer};
use arrow_data::bit_iterator::{BitIndexIterator, BitSliceIterator};
use arrow_data::transform::MutableArrayData;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::*;

/// If the filter selects more than this fraction of rows, use
/// [`SlicesIterator`] to copy ranges of values. Otherwise iterate
Expand Down Expand Up @@ -130,7 +130,7 @@ pub type Filter<'a> = Box<dyn Fn(&ArrayData) -> ArrayData + 'a>;
/// Deprecated: Use [`FilterBuilder`] instead
#[deprecated]
#[allow(deprecated)]
pub fn build_filter(filter: &BooleanArray) -> Result<Filter> {
pub fn build_filter(filter: &BooleanArray) -> Result<Filter, ArrowError> {
let iter = SlicesIterator::new(filter);
let filter_count = filter_count(filter);
let chunks = iter.collect::<Vec<_>>();
Expand Down Expand Up @@ -173,19 +173,18 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
///
/// # Example
/// ```rust
/// # use arrow::array::{Int32Array, BooleanArray};
/// # use arrow::error::Result;
/// # use arrow::compute::kernels::filter::filter;
/// # fn main() -> Result<()> {
/// # use arrow_array::{Int32Array, BooleanArray};
/// # use arrow_select::filter::filter;
/// let array = Int32Array::from(vec![5, 6, 7, 8, 9]);
/// let filter_array = BooleanArray::from(vec![true, false, false, true, false]);
/// let c = filter(&array, &filter_array)?;
/// let c = filter(&array, &filter_array).unwrap();
/// let c = c.as_any().downcast_ref::<Int32Array>().unwrap();
/// assert_eq!(c, &Int32Array::from(vec![5, 8]));
/// # Ok(())
/// # }
/// ```
pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result<ArrayRef> {
pub fn filter(
values: &dyn Array,
predicate: &BooleanArray,
) -> Result<ArrayRef, ArrowError> {
let predicate = FilterBuilder::new(predicate).build();
filter_array(values, &predicate)
}
Expand All @@ -194,7 +193,7 @@ pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result<ArrayRef>
pub fn filter_record_batch(
record_batch: &RecordBatch,
predicate: &BooleanArray,
) -> Result<RecordBatch> {
) -> Result<RecordBatch, ArrowError> {
let mut filter_builder = FilterBuilder::new(predicate);
if record_batch.num_columns() > 1 {
// Only optimize if filtering more than one column
Expand All @@ -206,7 +205,7 @@ pub fn filter_record_batch(
.columns()
.iter()
.map(|a| filter_array(a, &filter))
.collect::<Result<Vec<_>>>()?;
.collect::<Result<Vec<_>, _>>()?;

RecordBatch::try_new(record_batch.schema(), filtered_arrays)
}
Expand Down Expand Up @@ -318,12 +317,15 @@ pub struct FilterPredicate {

impl FilterPredicate {
/// Selects rows from `values` based on this [`FilterPredicate`]
pub fn filter(&self, values: &dyn Array) -> Result<ArrayRef> {
pub fn filter(&self, values: &dyn Array) -> Result<ArrayRef, ArrowError> {
filter_array(values, self)
}
}

fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result<ArrayRef> {
fn filter_array(
values: &dyn Array,
predicate: &FilterPredicate,
) -> Result<ArrayRef, ArrowError> {
if predicate.filter.len() > values.len() {
return Err(ArrowError::InvalidArgumentError(format!(
"Filter predicate of length {} is larger than target array of length {}",
Expand Down Expand Up @@ -683,15 +685,11 @@ where

#[cfg(test)]
mod tests {
use arrow_array::builder::*;
use arrow_array::types::*;
use rand::distributions::{Alphanumeric, Standard};
use rand::prelude::*;

use crate::datatypes::Int64Type;
use crate::{
buffer::Buffer,
datatypes::{DataType, Field},
};

use super::*;

macro_rules! def_temporal_test {
Expand Down Expand Up @@ -922,24 +920,6 @@ mod tests {
assert_eq!("world", values.value(d.keys().value(1) as usize));
}

#[test]
fn test_filter_string_array_with_negated_boolean_array() {
let a = StringArray::from(vec!["hello", " ", "world", "!"]);
let mut bb = BooleanBuilder::with_capacity(2);
bb.append_value(false);
bb.append_value(true);
bb.append_value(false);
bb.append_value(true);
let b = bb.finish();
let b = crate::compute::not(&b).unwrap();

let c = filter(&a, &b).unwrap();
let d = c.as_ref().as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(2, d.len());
assert_eq!("hello", d.value(0));
assert_eq!("world", d.value(1));
}

#[test]
fn test_filter_list_array() {
let value_data = ArrayData::builder(DataType::Int32)
Expand Down Expand Up @@ -1027,36 +1007,22 @@ mod tests {
}

#[test]
fn test_null_mask() -> Result<()> {
use crate::compute::kernels::comparison;
let a: PrimitiveArray<Int64Type> =
PrimitiveArray::from(vec![Some(1), Some(2), None]);
let mask0 = comparison::eq(&a, &a)?;
let out0 = filter(&a, &mask0)?;
let out_arr0 = out0
.as_any()
.downcast_ref::<PrimitiveArray<Int64Type>>()
.unwrap();
fn test_null_mask() {
let a = Int64Array::from(vec![Some(1), Some(2), None]);

let mask1 = BooleanArray::from(vec![Some(true), Some(true), None]);
let out1 = filter(&a, &mask1)?;
let out_arr1 = out1
.as_any()
.downcast_ref::<PrimitiveArray<Int64Type>>()
.unwrap();
assert_eq!(mask0, mask1);
assert_eq!(out_arr0, out_arr1);
Ok(())
let out = filter(&a, &mask1).unwrap();
assert_eq!(&out, &a.slice(0, 2));
}

#[test]
fn test_fast_path() -> Result<()> {
fn test_fast_path() {
let a: PrimitiveArray<Int64Type> =
PrimitiveArray::from(vec![Some(1), Some(2), None]);

// all true
let mask = BooleanArray::from(vec![true, true, true]);
let out = filter(&a, &mask)?;
let out = filter(&a, &mask).unwrap();
let b = out
.as_any()
.downcast_ref::<PrimitiveArray<Int64Type>>()
Expand All @@ -1065,10 +1031,9 @@ mod tests {

// all false
let mask = BooleanArray::from(vec![false, false, false]);
let out = filter(&a, &mask)?;
let out = filter(&a, &mask).unwrap();
assert_eq!(out.len(), 0);
assert_eq!(out.data_type(), &DataType::Int64);
Ok(())
}

#[test]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use arrow_schema::ArrowError;
/// values array 1
/// ```
///
/// For selecting values by index from a single array see [compute::take](crate::compute::take)
/// For selecting values by index from a single array see [`crate::interleave`]
pub fn interleave(
values: &[&dyn Array],
indices: &[(usize, usize)],
Expand Down
22 changes: 22 additions & 0 deletions arrow-select/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Arrow selection kernels

pub mod filter;
pub mod interleave;
pub mod take;

0 comments on commit 9a8b04d

Please sign in to comment.