From 5f441eedff2b7621c46aded8b1caf3b665b8e8a9 Mon Sep 17 00:00:00 2001 From: Artjoms Iskovs Date: Thu, 15 Sep 2022 17:22:53 +0100 Subject: [PATCH 01/16] Fix multipart uploads on Minio (#2731) The official Minio SDK uses "uploads=" as the URL when it initiates a multipart upload instead of "uploads". This affects the AWSV4 signature and causes object_store to fail a signature check when initiating the upload to Minio. It's possible that this contradicts the AWS S3 API docs: https://docs.aws.amazon.com/AmazonS3/latest/API/API_CreateMultipartUpload.html#API_CreateMultipartUpload_RequestSyntax and we need to instead keep the URL as `?uploads` and change the URL that goes into the signature instead. --- object_store/src/aws/client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_store/src/aws/client.rs b/object_store/src/aws/client.rs index d8ab3bba8f2..f800fec3dc5 100644 --- a/object_store/src/aws/client.rs +++ b/object_store/src/aws/client.rs @@ -411,7 +411,7 @@ impl S3Client { pub async fn create_multipart(&self, location: &Path) -> Result { let credential = self.get_credential().await?; let url = format!( - "{}/{}/{}?uploads", + "{}/{}/{}?uploads=", self.config.endpoint, self.config.bucket, encode_path(location) From a7a93295bd4a143d55fa31a1c6ac92045d73dc05 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 15 Sep 2022 17:23:22 +0100 Subject: [PATCH 02/16] Update read parquet example in parquet/arrow home (#2730) * Update example to read parquet * Remove outdated comment --- parquet/src/arrow/mod.rs | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index c0de656bf9c..c5fe0fa2a62 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -66,26 +66,23 @@ //! # Example of reading parquet file into arrow record batch //! //! ```rust -//! use arrow::record_batch::RecordBatchReader; -//! use parquet::file::reader::{FileReader, SerializedFileReader}; -//! use parquet::arrow::{ParquetFileArrowReader, ArrowReader, ProjectionMask}; -//! use std::sync::Arc; //! use std::fs::File; +//! use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; //! +//! # use std::sync::Arc; //! # use arrow::array::Int32Array; //! # use arrow::datatypes::{DataType, Field, Schema}; //! # use arrow::record_batch::RecordBatch; //! # use parquet::arrow::arrow_writer::ArrowWriter; +//! # //! # let ids = Int32Array::from(vec![1, 2, 3, 4]); //! # let schema = Arc::new(Schema::new(vec![ -//! # Field::new("id", DataType::Int32, false), +//! # Field::new("id", DataType::Int32, false), //! # ])); //! # -//! # // Write to a memory buffer (can also write to a File) //! # let file = File::create("data.parquet").unwrap(); //! # -//! # let batch = -//! # RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids)]).unwrap(); +//! # let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids)]).unwrap(); //! # let batches = vec![batch]; //! # //! # let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), None).unwrap(); @@ -97,26 +94,14 @@ //! //! let file = File::open("data.parquet").unwrap(); //! -//! let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); -//! let mask = ProjectionMask::leaves(arrow_reader.parquet_schema(), [0]); -//! -//! println!("Converted arrow schema is: {}", arrow_reader.get_schema().unwrap()); -//! println!("Arrow schema after projection is: {}", -//! arrow_reader.get_schema_by_columns(mask.clone()).unwrap()); +//! 
let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap(); +//! println!("Converted arrow schema is: {}", builder.schema()); //! -//! let mut unprojected = arrow_reader.get_record_reader(2048).unwrap(); -//! println!("Unprojected reader schema: {}", unprojected.schema()); +//! let mut reader = builder.build().unwrap(); //! -//! let mut record_batch_reader = arrow_reader.get_record_reader_by_columns(mask, 2048).unwrap(); +//! let record_batch = reader.next().unwrap().unwrap(); //! -//! for maybe_record_batch in record_batch_reader { -//! let record_batch = maybe_record_batch.unwrap(); -//! if record_batch.num_rows() > 0 { -//! println!("Read {} records.", record_batch.num_rows()); -//! } else { -//! println!("End of file!"); -//! } -//!} +//! println!("Read {} records.", record_batch.num_rows()); //! ``` experimental!(mod array_reader); From eb9b456fdde92d4ca12c7573fb38faf6e6657fc3 Mon Sep 17 00:00:00 2001 From: Kun Liu Date: Fri, 16 Sep 2022 03:12:06 +0800 Subject: [PATCH 03/16] benchmark: bitwise operation (#2718) * add benchmark for bitwise operation * add bench for bitwise or xor not --- arrow/Cargo.toml | 5 ++ arrow/benches/bitwise_kernel.rs | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 arrow/benches/bitwise_kernel.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index c66cef61202..e52940b4fc4 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -240,3 +240,8 @@ harness = false name = "row_format" harness = false required-features = ["test_utils"] + +[[bench]] +name = "bitwise_kernel" +harness = false +required-features = ["test_utils"] diff --git a/arrow/benches/bitwise_kernel.rs b/arrow/benches/bitwise_kernel.rs new file mode 100644 index 00000000000..741eb96125a --- /dev/null +++ b/arrow/benches/bitwise_kernel.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
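+
+// Benchmarks for the bitwise kernels (`and`, `or`, `xor`, their `_scalar`
+// variants, and `not`) over 64K-element Int64 arrays, at 0% and 20% null
+// density. The bench target is gated on the `test_utils` feature (see the
+// `[[bench]]` entry added to `arrow/Cargo.toml` above), so a typical
+// invocation is presumably:
+//
+//     cargo bench --features test_utils --bench bitwise_kernel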
+
+#[macro_use]
+extern crate criterion;
+
+use arrow::compute::kernels::bitwise::{
+    bitwise_and, bitwise_and_scalar, bitwise_not, bitwise_or, bitwise_or_scalar,
+    bitwise_xor, bitwise_xor_scalar,
+};
+use arrow::datatypes::Int64Type;
+use criterion::{black_box, Criterion};
+use rand::RngCore;
+
+extern crate arrow;
+
+use arrow::util::bench_util::create_primitive_array;
+use arrow::util::test_util::seedable_rng;
+
+fn bitwise_array_benchmark(c: &mut Criterion) {
+    let size = 64 * 1024_usize;
+    let left_without_null = create_primitive_array::<Int64Type>(size, 0 as f32);
+    let right_without_null = create_primitive_array::<Int64Type>(size, 0 as f32);
+    let left_with_null = create_primitive_array::<Int64Type>(size, 0.2_f32);
+    let right_with_null = create_primitive_array::<Int64Type>(size, 0.2_f32);
+    // array and
+    let mut group = c.benchmark_group("bench bitwise array: and");
+    group.bench_function("bitwise array and, no nulls", |b| {
+        b.iter(|| {
+            black_box(bitwise_and(&left_without_null, &right_without_null).unwrap())
+        })
+    });
+    group.bench_function("bitwise array and, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_and(&left_with_null, &right_with_null).unwrap()))
+    });
+    group.finish();
+    // array or
+    let mut group = c.benchmark_group("bench bitwise: or");
+    group.bench_function("bitwise array or, no nulls", |b| {
+        b.iter(|| black_box(bitwise_or(&left_without_null, &right_without_null).unwrap()))
+    });
+    group.bench_function("bitwise array or, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_or(&left_with_null, &right_with_null).unwrap()))
+    });
+    group.finish();
+    // xor
+    let mut group = c.benchmark_group("bench bitwise: xor");
+    group.bench_function("bitwise array xor, no nulls", |b| {
+        b.iter(|| {
+            black_box(bitwise_xor(&left_without_null, &right_without_null).unwrap())
+        })
+    });
+    group.bench_function("bitwise array xor, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_xor(&left_with_null, &right_with_null).unwrap()))
+    });
+    group.finish();
+    // not
+    let mut group = c.benchmark_group("bench bitwise: not");
+    group.bench_function("bitwise array not, no nulls", |b| {
+        b.iter(|| black_box(bitwise_not(&left_without_null).unwrap()))
+    });
+    group.bench_function("bitwise array not, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_not(&left_with_null).unwrap()))
+    });
+    group.finish();
+}
+
+fn bitwise_array_scalar_benchmark(c: &mut Criterion) {
+    let size = 64 * 1024_usize;
+    let array_without_null = create_primitive_array::<Int64Type>(size, 0 as f32);
+    let array_with_null = create_primitive_array::<Int64Type>(size, 0.2_f32);
+    let scalar = seedable_rng().next_u64() as i64;
+    // array scalar and
+    let mut group = c.benchmark_group("bench bitwise array scalar: and");
+    group.bench_function("bitwise array scalar and, no nulls", |b| {
+        b.iter(|| black_box(bitwise_and_scalar(&array_without_null, scalar).unwrap()))
+    });
+    group.bench_function("bitwise array scalar and, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_and_scalar(&array_with_null, scalar).unwrap()))
+    });
+    group.finish();
+    // array scalar or
+    let mut group = c.benchmark_group("bench bitwise array scalar: or");
+    group.bench_function("bitwise array scalar or, no nulls", |b| {
+        b.iter(|| black_box(bitwise_or_scalar(&array_without_null, scalar).unwrap()))
+    });
+    group.bench_function("bitwise array scalar or, 20% nulls", |b| {
+        b.iter(|| black_box(bitwise_or_scalar(&array_with_null, scalar).unwrap()))
+    });
+    group.finish();
+    // array scalar xor
+    let mut group = c.benchmark_group("bench bitwise array scalar: xor");
+    group.bench_function("bitwise array scalar xor, no nulls", |b| {
b.iter(|| black_box(bitwise_xor_scalar(&array_without_null, scalar).unwrap())) + }); + group.bench_function("bitwise array scalar xor, 20% nulls", |b| { + b.iter(|| black_box(bitwise_xor_scalar(&array_with_null, scalar).unwrap())) + }); + group.finish(); +} + +criterion_group!( + benches, + bitwise_array_benchmark, + bitwise_array_scalar_benchmark +); +criterion_main!(benches); From 5238789244be27380347b19b0747c9dcd9938470 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Thu, 15 Sep 2022 15:21:11 -0400 Subject: [PATCH 04/16] Automate updates to `CHANGELOG-old.md` (#2732) * feature complete * fix footer issue * fix duplicate changelog issue * use tac instead of head for head -n - is not universal * adjust blank lines * fix footer dropping * line adj * add .bak2 to gitignore --- .gitignore | 2 +- dev/release/update_change_log.sh | 35 +++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 2a21776aa54..b8506ea06cb 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,7 @@ __blobstorage__ # .bak files *.bak - +*.bak2 # OS-specific .gitignores # Mac .gitignore diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh index 252cd285d92..a3af50a8a6e 100755 --- a/dev/release/update_change_log.sh +++ b/dev/release/update_change_log.sh @@ -29,16 +29,45 @@ set -e -SINCE_TAG="21.0.0" -FUTURE_RELEASE="22.0.0" +SINCE_TAG="22.0.0" +FUTURE_RELEASE="23.0.0" SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" +OLD_OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG-old.md" # remove license header so github-changelog-generator has a clean base to append -sed -i.bak '1,18d' "${OUTPUT_PATH}" +sed -i.bak '1,21d' "${OUTPUT_PATH}" +sed -i.bak '1,21d' "${OLD_OUTPUT_PATH}" +# remove the github-changelog-generator footer from the old CHANGELOG.md +LINE_COUNT=$(wc -l <"${OUTPUT_PATH}") +sed -i.bak2 "$(( $LINE_COUNT-4+1 )),$ d" "${OUTPUT_PATH}" + +# Copy the previous CHANGELOG.md to CHANGELOG-old.md +echo ' + +# Historical Changelog +' | cat - "${OUTPUT_PATH}" "${OLD_OUTPUT_PATH}" > "${OLD_OUTPUT_PATH}".tmp +mv "${OLD_OUTPUT_PATH}".tmp "${OLD_OUTPUT_PATH}" # use exclude-tags-regex to filter out tags used for object_store # crates and only only look at tags that DO NOT begin with `object_store_` From 0ebd71e0d3d132250a2e5743f24f952c58c236d3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 15 Sep 2022 22:44:22 +0100 Subject: [PATCH 05/16] Partially flatten arrow-buffer (#2737) * Partially flatten arrow-buffer * Format --- arrow-buffer/src/lib.rs | 9 +++++++-- arrow/src/bitmap.rs | 3 +-- arrow/src/datatypes/native.rs | 2 +- arrow/src/util/mod.rs | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index a8aca7c3dad..74d2bd5ec86 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -19,6 +19,11 @@ pub mod alloc; pub mod buffer; +pub use buffer::{Buffer, MutableBuffer}; + mod bytes; -pub mod native; -pub mod util; +mod native; + +pub use native::*; +mod util; +pub use util::*; diff --git a/arrow/src/bitmap.rs b/arrow/src/bitmap.rs index 4491da4632b..dbf9706677a 100644 --- a/arrow/src/bitmap.rs +++ b/arrow/src/bitmap.rs @@ -17,12 +17,11 @@ //! 
Defines [Bitmap] for tracking validity bitmaps -use crate::buffer::Buffer; use crate::error::{ArrowError, Result}; use crate::util::bit_util; use std::mem; -use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or}; +use arrow_buffer::buffer::{buffer_bin_and, buffer_bin_or, Buffer}; use std::ops::{BitAnd, BitOr}; #[derive(Debug, Clone)] diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 8c329a066e5..de35c4804fa 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -16,7 +16,7 @@ // under the License. use super::DataType; -pub use arrow_buffer::native::{ArrowNativeType, ToByteSlice}; +pub use arrow_buffer::{ArrowNativeType, ToByteSlice}; use half::f16; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 5453c11ab8a..adafc9f5053 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -pub use arrow_buffer::util::{bit_chunk_iterator, bit_util}; +pub use arrow_buffer::{bit_chunk_iterator, bit_util}; #[cfg(feature = "test_utils")] pub mod bench_util; From 43d912c010b7374ceb3a632eedda4f55f90545d0 Mon Sep 17 00:00:00 2001 From: askoa <112126368+askoa@users.noreply.github.com> Date: Fri, 16 Sep 2022 05:59:39 -0400 Subject: [PATCH 06/16] Better construction of RecordBatchOptions (#2729) * include builder for RecordBatchOptions * fix clippy warnings * fix clippy warnings * remove builder struct * removed a wrong comment * Update comment in arrow/src/record_batch.rs Co-authored-by: Andrew Lamb * Update comment in arrow/src/record_batch.rs Co-authored-by: Andrew Lamb Co-authored-by: askoa Co-authored-by: Andrew Lamb --- arrow/src/ipc/reader.rs | 12 ++++-------- arrow/src/record_batch.rs | 35 +++++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 969c8c43f02..a784f54e20c 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -578,10 +578,7 @@ pub fn read_record_batch( let mut node_index = 0; let mut arrays = vec![]; - let options = RecordBatchOptions { - row_count: Some(batch.length() as usize), - ..Default::default() - }; + let options = RecordBatchOptions::new().with_row_count(Some(batch.length() as usize)); if let Some(projection) = projection { // project fields @@ -1692,10 +1689,9 @@ mod tests { #[test] fn test_no_columns_batch() { let schema = Arc::new(Schema::new(vec![])); - let options = RecordBatchOptions { - match_field_names: true, - row_count: Some(10), - }; + let options = RecordBatchOptions::new() + .with_match_field_names(true) + .with_row_count(Some(10)); let input_batch = RecordBatch::try_new_with_options(schema, vec![], &options).unwrap(); let output_batch = roundtrip_ipc_stream(&input_batch); diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs index 4b0d36a43e5..f71c67fe774 100644 --- a/arrow/src/record_batch.rs +++ b/arrow/src/record_batch.rs @@ -80,7 +80,7 @@ impl RecordBatch { /// # } /// ``` pub fn try_new(schema: SchemaRef, columns: Vec) -> Result { - let options = RecordBatchOptions::default(); + let options = RecordBatchOptions::new(); Self::try_new_impl(schema, columns, &options) } @@ -413,15 +413,29 @@ pub struct RecordBatchOptions { pub row_count: Option, } -impl Default for RecordBatchOptions { - fn default() -> Self { +impl RecordBatchOptions { + pub fn new() -> Self { Self { 
match_field_names: true, row_count: None, } } + /// Sets the row_count of RecordBatchOptions and returns self + pub fn with_row_count(mut self, row_count: Option) -> Self { + self.row_count = row_count; + self + } + /// Sets the match_field_names of RecordBatchOptions and returns self + pub fn with_match_field_names(mut self, match_field_names: bool) -> Self { + self.match_field_names = match_field_names; + self + } +} +impl Default for RecordBatchOptions { + fn default() -> Self { + Self::new() + } } - impl From<&StructArray> for RecordBatch { /// Create a record batch from struct array, where each field of /// the `StructArray` becomes a `Field` in the schema. @@ -901,10 +915,7 @@ mod tests { .to_string() .contains("must either specify a row count or at least one column")); - let options = RecordBatchOptions { - row_count: Some(10), - ..Default::default() - }; + let options = RecordBatchOptions::new().with_row_count(Some(10)); let ok = RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap(); @@ -929,4 +940,12 @@ mod tests { ); assert_eq!("Invalid argument error: Column 'a' is declared as non-nullable but contains null values", format!("{}", maybe_batch.err().unwrap())); } + #[test] + fn test_record_batch_options() { + let options = RecordBatchOptions::new() + .with_match_field_names(false) + .with_row_count(Some(20)); + assert!(!options.match_field_names); + assert_eq!(options.row_count.unwrap(), 20) + } } From f572ec1bef4a66a00b78f1d80a39992d63444ec2 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Fri, 16 Sep 2022 18:47:20 +0800 Subject: [PATCH 07/16] Update `try_binary` and `checked_ops`, and remove `math_checked_op` (#2717) * update try_binary delete math_checked_op update the return type of checked ops Signed-off-by: remzi <13716567376yh@gmail.com> * float div not panic on zero Signed-off-by: remzi <13716567376yh@gmail.com> * fix nan test Signed-off-by: remzi <13716567376yh@gmail.com> * add float divide by zero Signed-off-by: remzi <13716567376yh@gmail.com> * add float tests Signed-off-by: remzi <13716567376yh@gmail.com> * fix compile error Signed-off-by: remzi <13716567376yh@gmail.com> Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/Cargo.toml | 2 +- arrow/src/compute/kernels/arithmetic.rs | 220 ++++++++++-------------- arrow/src/compute/kernels/arity.rs | 14 +- arrow/src/datatypes/native.rs | 66 +++++-- 4 files changed, 153 insertions(+), 149 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index e52940b4fc4..1580856dfc0 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -51,7 +51,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"], op indexmap = { version = "1.9", default-features = false, features = ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } -half = { version = "2.0", default-features = false } +half = { version = "2.0", default-features = false, features = ["num-traits"]} hashbrown = { version = "0.12", default-features = false } csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index 04fe2393ec4..7b91a261c7e 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ 
b/arrow/src/compute/kernels/arithmetic.rs @@ -78,32 +78,6 @@ where Ok(binary(left, right, op)) } -/// This is similar to `math_op` as it performs given operation between two input primitive arrays. -/// But the given operation can return `None` if overflow is detected. For the case, this function -/// returns an `Err`. -fn math_checked_op( - left: &PrimitiveArray, - right: &PrimitiveArray, - op: F, -) -> Result> -where - LT: ArrowNumericType, - RT: ArrowNumericType, - F: Fn(LT::Native, RT::Native) -> Option, -{ - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - try_binary(left, right, |a, b| { - op(a, b).ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: {:?}, {:?}", a, b)) - }) - }) -} - /// Helper function for operations where a valid `0` on the right array should /// result in an [ArrowError::DivideByZero], namely the division and modulo operations /// @@ -121,26 +95,9 @@ where LT: ArrowNumericType, RT: ArrowNumericType, RT::Native: One + Zero, - F: Fn(LT::Native, RT::Native) -> Option, + F: Fn(LT::Native, RT::Native) -> Result, { - if left.len() != right.len() { - return Err(ArrowError::ComputeError( - "Cannot perform math operation on arrays of different length".to_string(), - )); - } - - try_binary(left, right, |l, r| { - if r.is_zero() { - Err(ArrowError::DivideByZero) - } else { - op(l, r).ok_or_else(|| { - ArrowError::ComputeError(format!( - "Overflow happened on: {:?}, {:?}", - l, r - )) - }) - } - }) + try_binary(left, right, op) } /// Helper function for operations where a valid `0` on the right array should @@ -161,16 +118,12 @@ fn math_checked_divide_op_on_iters( where T: ArrowNumericType, T::Native: One + Zero, - F: Fn(T::Native, T::Native) -> T::Native, + F: Fn(T::Native, T::Native) -> Result, { let buffer = if null_bit_buffer.is_some() { let values = left.zip(right).map(|(left, right)| { if let (Some(l), Some(r)) = (left, right) { - if r.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(op(l, r)) - } + op(l, r) } else { Ok(T::default_value()) } @@ -179,15 +132,10 @@ where unsafe { Buffer::try_from_trusted_len_iter(values) } } else { // no value is null - let values = left.map(|l| l.unwrap()).zip(right.map(|r| r.unwrap())).map( - |(left, right)| { - if right.is_zero() { - Err(ArrowError::DivideByZero) - } else { - Ok(op(left, right)) - } - }, - ); + let values = left + .map(|l| l.unwrap()) + .zip(right.map(|r| r.unwrap())) + .map(|(left, right)| op(left, right)); // Safety: Iterator comes from a PrimitiveArray which reports its size correctly unsafe { Buffer::try_from_trusted_len_iter(values) } }?; @@ -654,7 +602,7 @@ where K: ArrowNumericType, T: ArrowNumericType, T::Native: One + Zero, - F: Fn(T::Native, T::Native) -> T::Native, + F: Fn(T::Native, T::Native) -> Result, { if left.len() != right.len() { return Err(ArrowError::ComputeError(format!( @@ -725,7 +673,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.add_checked(b)) + try_binary(left, right, |a, b| a.add_checked(b)) } /// Perform `left + right` operation on two arrays. 
If either left or right value is null @@ -826,11 +774,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary(array, |value| { - value.add_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) - }) - }) + try_unary(array, |value| value.add_checked(scalar)) } /// Add every value in an array by a scalar. If any value in the array is null then the @@ -863,12 +807,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.add_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!("Overflow: adding {:?} to {:?}", scalar, value)) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.add_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -900,7 +840,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.sub_checked(b)) + try_binary(left, right, |a, b| a.sub_checked(b)) } /// Perform `left - right` operation on two arrays. If either left or right value is null @@ -953,14 +893,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp + Zero, { - try_unary(array, |value| { - value.sub_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: subtracting {:?} from {:?}", - scalar, value - )) - }) - }) + try_unary(array, |value| value.sub_checked(scalar)) } /// Subtract every value in an array by a scalar. If any value in the array is null then the @@ -991,15 +924,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.sub_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: subtracting {:?} from {:?}", - scalar, value - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.sub_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `-` operation on an array. If value is null then the result is also null. @@ -1052,7 +978,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - math_checked_op(left, right, |a, b| a.mul_checked(b)) + try_binary(left, right, |a, b| a.mul_checked(b)) } /// Perform `left * right` operation on two arrays. If either left or right value is null @@ -1105,14 +1031,7 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp + Zero + One, { - try_unary(array, |value| { - value.mul_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: multiplying {:?} by {:?}", - value, scalar, - )) - }) - }) + try_unary(array, |value| value.mul_checked(scalar)) } /// Multiply every value in an array by a scalar. If any value in the array is null then the @@ -1143,15 +1062,8 @@ where T: ArrowNumericType, T::Native: ArrowNativeTypeOp, { - try_unary_dyn::<_, T>(array, |value| { - value.mul_checked(scalar).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: multiplying {:?} by {:?}", - value, scalar - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.mul_checked(scalar)) + .map(|a| Arc::new(a) as ArrayRef) } /// Perform `left % right` operation on two arrays. 
If either left or right value is null @@ -1170,7 +1082,13 @@ where a % b }); #[cfg(not(feature = "simd"))] - return math_checked_divide_op(left, right, |a, b| Some(a % b)); + return try_binary(left, right, |a, b| { + if b.is_zero() { + Err(ArrowError::DivideByZero) + } else { + Ok(a % b) + } + }); } /// Perform `left / right` operation on two arrays. If either left or right value is null @@ -1225,12 +1143,17 @@ where pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result { match left.data_type() { DataType::Dictionary(_, _) => { - typed_dict_math_op!(left, right, |a, b| a / b, math_divide_checked_op_dict) + typed_dict_math_op!( + left, + right, + |a, b| a.div_checked(b), + math_divide_checked_op_dict + ) } _ => { downcast_primitive_array!( (left, right) => { - math_checked_divide_op(left, right, |a, b| Some(a / b)).map(|a| Arc::new(a) as ArrayRef) + math_checked_divide_op(left, right, |a, b| a.div_checked(b)).map(|a| Arc::new(a) as ArrayRef) } _ => Err(ArrowError::CastError(format!( "Unsupported data type {}, {}", @@ -1331,15 +1254,8 @@ where return Err(ArrowError::DivideByZero); } - try_unary_dyn::<_, T>(array, |value| { - value.div_checked(divisor).ok_or_else(|| { - ArrowError::CastError(format!( - "Overflow: dividing {:?} by {:?}", - value, divisor - )) - }) - }) - .map(|a| Arc::new(a) as ArrayRef) + try_unary_dyn::<_, T>(array, |value| value.div_checked(divisor)) + .map(|a| Arc::new(a) as ArrayRef) } #[cfg(test)] @@ -2134,23 +2050,41 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_by_zero_with_checked() { + fn test_int_array_divide_by_zero_with_checked() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide_checked(&a, &b).unwrap(); } + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_f32_array_divide_by_zero_with_checked() { + let a = Float32Array::from(vec![15.0]); + let b = Float32Array::from(vec![0.0]); + divide_checked(&a, &b).unwrap(); + } + #[test] #[should_panic(expected = "attempt to divide by zero")] - fn test_primitive_array_divide_by_zero() { + fn test_int_array_divide_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide(&a, &b).unwrap(); } + #[test] + fn test_f32_array_divide_by_zero() { + let a = Float32Array::from(vec![1.5, 0.0, -1.5]); + let b = Float32Array::from(vec![0.0, 0.0, 0.0]); + let result = divide(&a, &b).unwrap(); + assert_eq!(result.value(0), f32::INFINITY); + assert!(result.value(1).is_nan()); + assert_eq!(result.value(2), f32::NEG_INFINITY); + } + #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_dyn_by_zero() { + fn test_int_array_divide_dyn_by_zero() { let a = Int32Array::from(vec![15]); let b = Int32Array::from(vec![0]); divide_dyn(&a, &b).unwrap(); @@ -2158,7 +2092,15 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_divide_dyn_by_zero_dict() { + fn test_f32_array_divide_dyn_by_zero() { + let a = Float32Array::from(vec![1.5]); + let b = Float32Array::from(vec![0.0]); + divide_dyn(&a, &b).unwrap(); + } + + #[test] + #[should_panic(expected = "DivideByZero")] + fn test_int_array_divide_dyn_by_zero_dict() { let mut builder = PrimitiveDictionaryBuilder::::with_capacity(1, 1); builder.append(15).unwrap(); @@ -2174,14 +2116,38 @@ mod tests { #[test] #[should_panic(expected = "DivideByZero")] - fn test_primitive_array_modulus_by_zero() { + fn test_f32_dict_array_divide_dyn_by_zero() { + let mut builder = + 
PrimitiveDictionaryBuilder::<Int8Type, Float32Type>::with_capacity(1, 1);
+        builder.append(1.5).unwrap();
+        let a = builder.finish();
+
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Float32Type>::with_capacity(1, 1);
+        builder.append(0.0).unwrap();
+        let b = builder.finish();
+
+        divide_dyn(&a, &b).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "DivideByZero")]
+    fn test_i32_array_modulus_by_zero() {
         let a = Int32Array::from(vec![15]);
         let b = Int32Array::from(vec![0]);
         modulus(&a, &b).unwrap();
     }
 
     #[test]
-    fn test_primitive_array_divide_f64() {
+    #[should_panic(expected = "DivideByZero")]
+    fn test_f32_array_modulus_by_zero() {
+        let a = Float32Array::from(vec![1.5]);
+        let b = Float32Array::from(vec![0.0]);
+        modulus(&a, &b).unwrap();
+    }
+
+    #[test]
+    fn test_f64_array_divide() {
         let a = Float64Array::from(vec![15.0, 15.0, 8.0]);
         let b = Float64Array::from(vec![5.0, 6.0, 8.0]);
         let c = divide(&a, &b).unwrap();
diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs
index 21c633116ee..5060234c71b 100644
--- a/arrow/src/compute/kernels/arity.rs
+++ b/arrow/src/compute/kernels/arity.rs
@@ -261,9 +261,10 @@ where
 ///
 /// Like [`try_unary`] the function is only evaluated for non-null indices
 ///
-/// # Panic
+/// # Errors
 ///
-/// Panics if the arrays have different lengths
+/// Returns an error if the arrays have different lengths or
+/// if the operation itself errors
 pub fn try_binary<A, B, F, O>(
     a: &PrimitiveArray<A>,
     b: &PrimitiveArray<B>,
@@ -275,13 +276,16 @@ where
     O: ArrowPrimitiveType,
     F: Fn(A::Native, B::Native) -> Result<O::Native>,
 {
-    assert_eq!(a.len(), b.len());
-    let len = a.len();
-
+    if a.len() != b.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform a binary operation on arrays of different length".to_string(),
+        ));
+    }
     if a.is_empty() {
         return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
     }
 
+    let len = a.len();
     let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap();
     let null_count = null_buffer
         .as_ref()
diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs
index de35c4804fa..dec0cc4b53b 100644
--- a/arrow/src/datatypes/native.rs
+++ b/arrow/src/datatypes/native.rs
@@ -16,8 +16,10 @@
 // under the License.
 
 use super::DataType;
+use crate::error::{ArrowError, Result};
 pub use arrow_buffer::{ArrowNativeType, ToByteSlice};
 use half::f16;
+use num::Zero;
 
 /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the
 /// static-typed nature of rust types ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`].
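A minimal sketch (not part of the patch) of the behavior these `Result`-based checked ops encode, assuming the `divide` and `divide_checked` kernels re-exported from `arrow::compute`: division by zero is an error in both kernels for integers, but only in the checked kernel for floats.

```rust
use arrow::array::{Float32Array, Int32Array};
use arrow::compute::{divide, divide_checked};

fn main() {
    // Integer checked division by zero surfaces ArrowError::DivideByZero
    // rather than panicking inside the kernel.
    let a = Int32Array::from(vec![15]);
    let b = Int32Array::from(vec![0]);
    assert!(divide_checked(&a, &b).is_err());

    // Unchecked float division follows IEEE 754 and yields infinities / NaN...
    let x = Float32Array::from(vec![1.5, 0.0]);
    let y = Float32Array::from(vec![0.0, 0.0]);
    let d = divide(&x, &y).unwrap();
    assert_eq!(d.value(0), f32::INFINITY);
    assert!(d.value(1).is_nan());

    // ...while the checked variant reports DivideByZero for floats as well,
    // matching test_f32_array_divide_by_zero_with_checked above.
    assert!(divide_checked(&x, &y).is_err());
}
```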
@@ -43,6 +45,8 @@ pub trait ArrowPrimitiveType: 'static {
 
 pub(crate) mod native_op {
     use super::ArrowNativeType;
+    use crate::error::{ArrowError, Result};
+    use num::Zero;
     use std::ops::{Add, Div, Mul, Sub};
 
     /// Trait for ArrowNativeType to provide overflow-checking and non-overflow-checking
@@ -61,33 +65,38 @@
         + Sub<Output = Self>
         + Mul<Output = Self>
         + Div<Output = Self>
+        + Zero
     {
-        fn add_checked(self, rhs: Self) -> Option<Self> {
-            Some(self + rhs)
+        fn add_checked(self, rhs: Self) -> Result<Self> {
+            Ok(self + rhs)
         }
 
         fn add_wrapping(self, rhs: Self) -> Self {
             self + rhs
         }
 
-        fn sub_checked(self, rhs: Self) -> Option<Self> {
-            Some(self - rhs)
+        fn sub_checked(self, rhs: Self) -> Result<Self> {
+            Ok(self - rhs)
         }
 
         fn sub_wrapping(self, rhs: Self) -> Self {
             self - rhs
         }
 
-        fn mul_checked(self, rhs: Self) -> Option<Self> {
-            Some(self * rhs)
+        fn mul_checked(self, rhs: Self) -> Result<Self> {
+            Ok(self * rhs)
        }
 
         fn mul_wrapping(self, rhs: Self) -> Self {
             self * rhs
         }
 
-        fn div_checked(self, rhs: Self) -> Option<Self> {
-            Some(self / rhs)
+        fn div_checked(self, rhs: Self) -> Result<Self> {
+            if rhs.is_zero() {
+                Err(ArrowError::DivideByZero)
+            } else {
+                Ok(self / rhs)
+            }
         }
 
         fn div_wrapping(self, rhs: Self) -> Self {
@@ -99,32 +108,56 @@ pub(crate) mod native_op {
 macro_rules! native_type_op {
     ($t:tt) => {
         impl native_op::ArrowNativeTypeOp for $t {
-            fn add_checked(self, rhs: Self) -> Option<Self> {
-                self.checked_add(rhs)
+            fn add_checked(self, rhs: Self) -> Result<Self> {
+                self.checked_add(rhs).ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "Overflow happened on: {:?} + {:?}",
+                        self, rhs
+                    ))
+                })
             }
 
             fn add_wrapping(self, rhs: Self) -> Self {
                 self.wrapping_add(rhs)
             }
 
-            fn sub_checked(self, rhs: Self) -> Option<Self> {
-                self.checked_sub(rhs)
+            fn sub_checked(self, rhs: Self) -> Result<Self> {
+                self.checked_sub(rhs).ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "Overflow happened on: {:?} - {:?}",
+                        self, rhs
+                    ))
+                })
             }
 
             fn sub_wrapping(self, rhs: Self) -> Self {
                 self.wrapping_sub(rhs)
             }
 
-            fn mul_checked(self, rhs: Self) -> Option<Self> {
-                self.checked_mul(rhs)
+            fn mul_checked(self, rhs: Self) -> Result<Self> {
+                self.checked_mul(rhs).ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "Overflow happened on: {:?} * {:?}",
+                        self, rhs
+                    ))
+                })
             }
 
             fn mul_wrapping(self, rhs: Self) -> Self {
                 self.wrapping_mul(rhs)
             }
 
-            fn div_checked(self, rhs: Self) -> Option<Self> {
-                self.checked_div(rhs)
+            fn div_checked(self, rhs: Self) -> Result<Self> {
+                if rhs.is_zero() {
+                    Err(ArrowError::DivideByZero)
+                } else {
+                    self.checked_div(rhs).ok_or_else(|| {
+                        ArrowError::ComputeError(format!(
+                            "Overflow happened on: {:?} / {:?}",
+                            self, rhs
+                        ))
+                    })
+                }
             }
 
             fn div_wrapping(self, rhs: Self) -> Self {
@@ -138,6 +171,7 @@ native_type_op!(i8);
 native_type_op!(i16);
 native_type_op!(i32);
 native_type_op!(i64);
+native_type_op!(i128);
 native_type_op!(u8);
 native_type_op!(u16);
 native_type_op!(u32);

From 968a7673c7e1341431bc4d55a4f50e9fa6aff7d5 Mon Sep 17 00:00:00 2001
From: Daniël Heres
Date: Fri, 16 Sep 2022 14:52:48 +0200
Subject: [PATCH 08/16] Speedup string equal/not equal to empty string,
 cleanup like/ilike kernels, fix escape bug (#2743)

* Speedup string == ""
* neq too
* Simplify kernels
* Simplify kernels
* Fix test
* Escape contains
* Fmt
* Fix
---
 arrow/benches/equal.rs                  |  10 ++
 arrow/src/compute/kernels/comparison.rs | 216 +++++++++---------------
 2 files changed, 89 insertions(+), 137 deletions(-)

diff --git a/arrow/benches/equal.rs b/arrow/benches/equal.rs
index af535506e86..f54aff1b5cc 100644
--- a/arrow/benches/equal.rs
+++ b/arrow/benches/equal.rs
@@ -20,6 +20,7 @@
#[macro_use] extern crate criterion; +use arrow::compute::eq_utf8_scalar; use criterion::Criterion; extern crate arrow; @@ -31,6 +32,10 @@ fn bench_equal>(arr_a: &A) { criterion::black_box(arr_a == arr_a); } +fn bench_equal_utf8_scalar(arr_a: &GenericStringArray, right: &str) { + criterion::black_box(eq_utf8_scalar(arr_a, right).unwrap()); +} + fn add_benchmark(c: &mut Criterion) { let arr_a = create_primitive_array::(512, 0.0); c.bench_function("equal_512", |b| b.iter(|| bench_equal(&arr_a))); @@ -41,6 +46,11 @@ fn add_benchmark(c: &mut Criterion) { let arr_a = create_string_array::(512, 0.0); c.bench_function("equal_string_512", |b| b.iter(|| bench_equal(&arr_a))); + let arr_a = create_string_array::(512, 0.0); + c.bench_function("equal_string_scalar_empty_512", |b| { + b.iter(|| bench_equal_utf8_scalar(&arr_a, "")) + }); + let arr_a_nulls = create_string_array::(512, 0.5); c.bench_function("equal_string_nulls_512", |b| { b.iter(|| bench_equal(&arr_a_nulls)) diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 5a79c2e82df..d4eb5a3e1d2 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -233,61 +233,35 @@ pub fn like_utf8( } #[inline] -fn like_scalar<'a, L: ArrayAccessor>( +fn like_scalar_op<'a, F: Fn(bool) -> bool, L: ArrayAccessor>( left: L, right: &str, + op: F, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - if !right.contains(is_like_pattern) { // fast path, can use equals - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i) == right { - bit_util::set_bit(bool_slice, i); - } - } - } + compare_op_scalar(left, |item| op(item == right)) } else if right.ends_with('%') && !right.ends_with("\\%") && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use starts_with let starts_with = &right[..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).starts_with(starts_with) { - bit_util::set_bit(bool_slice, i); - } - } - } + + compare_op_scalar(left, |item| op(item.starts_with(starts_with))) } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { // fast path, can use ends_with let ends_with = &right[1..]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).ends_with(ends_with) { - bit_util::set_bit(bool_slice, i); - } - } - } + compare_op_scalar(left, |item| op(item.ends_with(ends_with))) } else if right.starts_with('%') && right.ends_with('%') + && !right.ends_with("\\%") && !right[1..right.len() - 1].contains(is_like_pattern) { - // fast path, can use contains let contains = &right[1..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i).contains(contains) { - bit_util::set_bit(bool_slice, i); - } - } - } + + compare_op_scalar(left, |item| op(item.contains(contains))) } else { let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { @@ -297,26 +271,16 @@ fn like_scalar<'a, L: ArrayAccessor>( )) })?; - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; + compare_op_scalar(left, |item| op(re.is_match(item))) + } +} - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - 
vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) +#[inline] +fn like_scalar<'a, L: ArrayAccessor>( + left: L, + right: &str, +) -> Result { + like_scalar_op(left, right, |x| x) } /// Perform SQL `left LIKE right` operation on [`StringArray`] / @@ -415,86 +379,7 @@ fn nlike_scalar<'a, L: ArrayAccessor>( left: L, right: &str, ) -> Result { - let null_bit_buffer = left.data().null_buffer().cloned(); - let bytes = bit_util::ceil(left.len(), 8); - let mut bool_buf = MutableBuffer::from_len_zeroed(bytes); - let bool_slice = bool_buf.as_slice_mut(); - - if !right.contains(is_like_pattern) { - // fast path, can use equals - for i in 0..left.len() { - unsafe { - if left.value_unchecked(i) != right { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.ends_with('%') - && !right.ends_with("\\%") - && !right[..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use starts_with - let starts_with = &right[..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).starts_with(starts_with)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') && !right[1..].contains(is_like_pattern) { - // fast path, can use ends_with - let ends_with = &right[1..]; - - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).ends_with(ends_with)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else if right.starts_with('%') - && right.ends_with('%') - && !right[1..right.len() - 1].contains(is_like_pattern) - { - // fast path, can use contains - let contains = &right[1..right.len() - 1]; - for i in 0..left.len() { - unsafe { - if !(left.value_unchecked(i).contains(contains)) { - bit_util::set_bit(bool_slice, i); - } - } - } - } else { - let re_pattern = replace_like_wildcards(right)?; - let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { - ArrowError::ComputeError(format!( - "Unable to build regex from LIKE pattern: {}", - e - )) - })?; - - for i in 0..left.len() { - let haystack = unsafe { left.value_unchecked(i) }; - if !re.is_match(haystack) { - bit_util::set_bit(bool_slice, i); - } - } - }; - - let data = unsafe { - ArrayData::new_unchecked( - DataType::Boolean, - left.len(), - None, - null_bit_buffer, - 0, - vec![bool_buf.into()], - vec![], - ) - }; - Ok(BooleanArray::from(data)) + like_scalar_op(left, right, |x| !x) } /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / @@ -966,11 +851,48 @@ pub fn eq_utf8( compare_op(left, right, |a, b| a == b) } +fn utf8_empty( + left: &GenericStringArray, +) -> Result { + let null_bit_buffer = left + .data() + .null_buffer() + .map(|b| b.bit_slice(left.offset(), left.len())); + + let buffer = unsafe { + MutableBuffer::from_trusted_len_iter_bool(left.value_offsets().windows(2).map( + |offset| { + if EQ { + offset[1].to_usize().unwrap() == offset[0].to_usize().unwrap() + } else { + offset[1].to_usize().unwrap() > offset[0].to_usize().unwrap() + } + }, + )) + }; + + let data = unsafe { + ArrayData::new_unchecked( + DataType::Boolean, + left.len(), + None, + null_bit_buffer, + 0, + vec![Buffer::from(buffer)], + vec![], + ) + }; + Ok(BooleanArray::from(data)) +} + /// Perform `left == right` operation on [`StringArray`] / [`LargeStringArray`] and a scalar. 
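+///
+/// If `right` is the empty string, this takes a fast path (`utf8_empty`
+/// above) that just checks whether each value's start and end offsets are
+/// equal, i.e. whether the value is itself empty, instead of comparing
+/// string contents.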
pub fn eq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { + if right.is_empty() { + return utf8_empty::<_, true>(left); + } compare_op_scalar(left, |a| a == right) } @@ -1167,6 +1089,9 @@ pub fn neq_utf8_scalar( left: &GenericStringArray, right: &str, ) -> Result { + if right.is_empty() { + return utf8_empty::<_, false>(left); + } compare_op_scalar(left, |a| a != right) } @@ -4324,13 +4249,22 @@ mod tests { #[test] fn test_utf8_eq_scalar_on_slice() { - let a = StringArray::from(vec![Some("hi"), None, Some("hello"), Some("world")]); - let a = a.slice(1, 3); + let a = StringArray::from( + vec![Some("hi"), None, Some("hello"), Some("world"), Some("")], + ); + let a = a.slice(1, 4); let a = as_string_array(&a); let a_eq = eq_utf8_scalar(a, "hello").unwrap(); assert_eq!( a_eq, - BooleanArray::from(vec![None, Some(true), Some(false)]) + BooleanArray::from(vec![None, Some(true), Some(false), Some(false)]) + ); + + let a_eq2 = eq_utf8_scalar(a, "").unwrap(); + + assert_eq!( + a_eq2, + BooleanArray::from(vec![None, Some(false), Some(false), Some(true)]) ); } @@ -4528,6 +4462,14 @@ mod tests { vec![true, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_escape_contains, + vec!["ba%", "ba\\x"], + "%a\\%", + like_utf8_scalar, + vec![true, false] + ); + test_utf8!( test_utf8_scalar_ilike_regex, vec!["%%%"], From 1da2bfbc82de12ac6fb699d2579d4a129929e004 Mon Sep 17 00:00:00 2001 From: Ian Alexander Joiner Date: Fri, 16 Sep 2022 12:23:29 -0400 Subject: [PATCH 09/16] Update version to `23.0.0` and update `CHANGELOG`, add `label_issue.py` script (#2734) * feature complete * fix footer issue * fix duplicate changelog issue * use tac instead of head for head -n - is not universal * adjust blank lines * fix footer dropping * line adj * add .bak2 to gitignore * Create changelog * Update version * Add initial relabeling script * more script * tweaks * Runnable as a script * Update changelog * updates * remove overzealous api change labeling Co-authored-by: Andrew Lamb --- CHANGELOG-old.md | 115 +++++++++++- CHANGELOG.md | 183 +++++++++---------- arrow-flight/Cargo.toml | 4 +- arrow-flight/README.md | 2 +- arrow-pyarrow-integration-testing/Cargo.toml | 4 +- arrow/Cargo.toml | 2 +- arrow/README.md | 4 +- dev/release/README.md | 2 +- dev/release/label_issues.py | 153 ++++++++++++++++ integration-testing/Cargo.toml | 2 +- parquet/Cargo.toml | 6 +- parquet_derive/Cargo.toml | 4 +- parquet_derive/README.md | 4 +- parquet_derive_test/Cargo.toml | 6 +- 14 files changed, 373 insertions(+), 118 deletions(-) create mode 100755 dev/release/label_issues.py diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md index 70322b5cfd1..02cb7ec2449 100644 --- a/CHANGELOG-old.md +++ b/CHANGELOG-old.md @@ -17,9 +17,122 @@ under the License. 
--> - # Historical Changelog +## [22.0.0](https://github.com/apache/arrow-rs/tree/22.0.0) (2022-09-02) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/21.0.0...22.0.0) + +**Breaking changes:** + +- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2614](https://github.com/apache/arrow-rs/pull/2614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Gate dyn comparison of dictionary arrays behind `dyn_cmp_dict` [\#2597](https://github.com/apache/arrow-rs/pull/2597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Move JsonSerializable to json module \(\#2300\) [\#2595](https://github.com/apache/arrow-rs/pull/2595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Decimal precision scale datatype change [\#2532](https://github.com/apache/arrow-rs/pull/2532) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor PrimitiveBuilder Constructors [\#2518](https://github.com/apache/arrow-rs/pull/2518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactoring DecimalBuilder constructors [\#2517](https://github.com/apache/arrow-rs/pull/2517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor FixedSizeBinaryBuilder Constructors [\#2516](https://github.com/apache/arrow-rs/pull/2516) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor BooleanBuilder Constructors [\#2515](https://github.com/apache/arrow-rs/pull/2515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Refactor UnionBuilder Constructors [\#2488](https://github.com/apache/arrow-rs/pull/2488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) + +**Implemented enhancements:** + +- Add Macros to assist with static dispatch [\#2635](https://github.com/apache/arrow-rs/issues/2635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support comparison between DictionaryArray and BooleanArray [\#2617](https://github.com/apache/arrow-rs/issues/2617) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2613](https://github.com/apache/arrow-rs/issues/2613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support empty projection in CSV, JSON readers [\#2603](https://github.com/apache/arrow-rs/issues/2603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support SQL-compliant NaN ordering between for DictionaryArray and non-DictionaryArray [\#2599](https://github.com/apache/arrow-rs/issues/2599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `dyn_cmp_dict` feature flag to gate dyn comparison of dictionary arrays [\#2596](https://github.com/apache/arrow-rs/issues/2596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2584](https://github.com/apache/arrow-rs/issues/2584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Allow 
FlightSQL implementers to extend `do_get()` [\#2581](https://github.com/apache/arrow-rs/issues/2581) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Support SQL-compliant behavior on `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2569](https://github.com/apache/arrow-rs/issues/2569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add sql-compliant feature for enabling sql-compliant kernel behavior [\#2568](https://github.com/apache/arrow-rs/issues/2568) +- Calculate `sum` for dictionary array [\#2565](https://github.com/apache/arrow-rs/issues/2565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add test for float nan comparison [\#2556](https://github.com/apache/arrow-rs/issues/2556) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with string array [\#2548](https://github.com/apache/arrow-rs/issues/2548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with primitive array in `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2538](https://github.com/apache/arrow-rs/issues/2538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2535](https://github.com/apache/arrow-rs/issues/2535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- UnionBuilder Create Children With Capacity [\#2523](https://github.com/apache/arrow-rs/issues/2523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speed up `like_utf8_scalar` for `%pat%` [\#2519](https://github.com/apache/arrow-rs/issues/2519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Replace macro with TypedDictionaryArray in comparison kernels [\#2513](https://github.com/apache/arrow-rs/issues/2513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use same codebase for boolean kernels [\#2507](https://github.com/apache/arrow-rs/issues/2507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use u8 for Decimal Precision and Scale [\#2496](https://github.com/apache/arrow-rs/issues/2496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Integrate skip row without pageIndex in SerializedPageReader in Fuzz Test [\#2475](https://github.com/apache/arrow-rs/issues/2475) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Avoid unecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add GenericColumnReader::skip\_records Missing OffsetIndex Fallback [\#2433](https://github.com/apache/arrow-rs/issues/2433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Support Reading PageIndex with ParquetRecordBatchStream [\#2430](https://github.com/apache/arrow-rs/issues/2430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Specialize FixedLenByteArrayReader for Parquet [\#2318](https://github.com/apache/arrow-rs/issues/2318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Make JSON support Optional via Feature Flag [\#2300](https://github.com/apache/arrow-rs/issues/2300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Fixed bugs:** + +- Casting timestamp array to string should not ignore timezone [\#2607](https://github.com/apache/arrow-rs/issues/2607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Ilike\_ut8\_scalar kernals have incorrect logic 
[\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Always validate the array data when creating array in IPC reader [\#2541](https://github.com/apache/arrow-rs/issues/2541) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Int96Converter Truncates Timestamps [\#2480](https://github.com/apache/arrow-rs/issues/2480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Error Reading Page Index When Not Available [\#2434](https://github.com/apache/arrow-rs/issues/2434) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `ParquetFileArrowReader::get_record_reader[_by_colum]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] + +**Documentation updates:** + +- Document All Arrow Features in docs.rs [\#2633](https://github.com/apache/arrow-rs/issues/2633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Closed issues:** + +- Add support for CAST from `Interval(DayTime)` to `Timestamp(Nanosecond, None)` [\#2606](https://github.com/apache/arrow-rs/issues/2606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Why do we check for null in TypedDictionaryArray value function [\#2564](https://github.com/apache/arrow-rs/issues/2564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add the `length` field for `Buffer` [\#2524](https://github.com/apache/arrow-rs/issues/2524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Avoid large over allocate buffer in async reader [\#2512](https://github.com/apache/arrow-rs/issues/2512) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- Rewriting Decimal Builders using `const_generic`. 
[\#2390](https://github.com/apache/arrow-rs/issues/2390) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Rewrite Decimal Array using `const_generic` [\#2384](https://github.com/apache/arrow-rs/issues/2384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] + +**Merged pull requests:** + +- Add downcast macros \(\#2635\) [\#2636](https://github.com/apache/arrow-rs/pull/2636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document all arrow features in docs.rs \(\#2633\) [\#2634](https://github.com/apache/arrow-rs/pull/2634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Document dyn\_cmp\_dict [\#2624](https://github.com/apache/arrow-rs/pull/2624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support comparison between DictionaryArray and BooleanArray [\#2618](https://github.com/apache/arrow-rs/pull/2618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Cast timestamp array to string array with timezone [\#2608](https://github.com/apache/arrow-rs/pull/2608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support empty projection in CSV and JSON readers [\#2604](https://github.com/apache/arrow-rs/pull/2604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Make JSON support optional via a feature flag \(\#2300\) [\#2601](https://github.com/apache/arrow-rs/pull/2601) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support SQL-compliant NaN ordering for DictionaryArray and non-DictionaryArray [\#2600](https://github.com/apache/arrow-rs/pull/2600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split out integration test plumbing \(\#2594\) \(\#2300\) [\#2598](https://github.com/apache/arrow-rs/pull/2598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Refactor Binary Builder and String Builder Constructors [\#2592](https://github.com/apache/arrow-rs/pull/2592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Dictionary like scalar kernels [\#2591](https://github.com/apache/arrow-rs/pull/2591) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Validate dictionary key in TypedDictionaryArray \(\#2578\) [\#2589](https://github.com/apache/arrow-rs/pull/2589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2585](https://github.com/apache/arrow-rs/pull/2585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Code cleanup of array value functions [\#2583](https://github.com/apache/arrow-rs/pull/2583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Allow overriding of do\_get & export useful macro [\#2582](https://github.com/apache/arrow-rs/pull/2582) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] 
([avantgardnerio](https://github.com/avantgardnerio)) +- MINOR: Upgrade to pyo3 0.17 [\#2576](https://github.com/apache/arrow-rs/pull/2576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) +- Support SQL-compliant NaN behavior on eq\_dyn, neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#2570](https://github.com/apache/arrow-rs/pull/2570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add sum\_dyn to calculate sum for dictionary array [\#2566](https://github.com/apache/arrow-rs/pull/2566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- struct UnionBuilder will create child buffers with capacity [\#2560](https://github.com/apache/arrow-rs/pull/2560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kastolars](https://github.com/kastolars)) +- Don't panic on RleValueEncoder::flush\_buffer if empty \(\#2558\) [\#2559](https://github.com/apache/arrow-rs/pull/2559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add the `length` field for Buffer and use more `Buffer` in IPC reader to avoid memory copy. [\#2557](https://github.com/apache/arrow-rs/pull/2557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([HaoYang670](https://github.com/HaoYang670)) +- Add test for float nan comparison [\#2555](https://github.com/apache/arrow-rs/pull/2555) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Compare dictionary array with string array [\#2549](https://github.com/apache/arrow-rs/pull/2549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Always validate the array data \(except the `Decimal`\) when creating array in IPC reader [\#2547](https://github.com/apache/arrow-rs/pull/2547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- MINOR: Fix test\_row\_type\_validation test [\#2546](https://github.com/apache/arrow-rs/pull/2546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix ilike\_utf8\_scalar kernels [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- fix typo [\#2540](https://github.com/apache/arrow-rs/pull/2540) ([00Masato](https://github.com/00Masato)) +- Compare dictionary array and primitive array in lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn kernels [\#2539](https://github.com/apache/arrow-rs/pull/2539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- \[MINOR\] Avoid large over allocate buffer in async reader [\#2537](https://github.com/apache/arrow-rs/pull/2537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2533](https://github.com/apache/arrow-rs/pull/2533) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add iterator for FixedSizeBinaryArray [\#2531](https://github.com/apache/arrow-rs/pull/2531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- add bench: decimal with byte array and
fixed length byte array [\#2529](https://github.com/apache/arrow-rs/pull/2529) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Add FixedLengthByteArrayReader Remove ComplexObjectArrayReader [\#2528](https://github.com/apache/arrow-rs/pull/2528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Split out byte array decoders \(\#2318\) [\#2527](https://github.com/apache/arrow-rs/pull/2527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use offset index in ParquetRecordBatchStream [\#2526](https://github.com/apache/arrow-rs/pull/2526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Clean the `create_array` in IPC reader. [\#2525](https://github.com/apache/arrow-rs/pull/2525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Remove DecimalByteArrayConvert \(\#2480\) [\#2522](https://github.com/apache/arrow-rs/pull/2522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Improve performance of `%pat%` \(\>3x speedup\) [\#2521](https://github.com/apache/arrow-rs/pull/2521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- remove len field from MapBuilder [\#2520](https://github.com/apache/arrow-rs/pull/2520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Replace macro with TypedDictionaryArray in comparison kernels [\#2514](https://github.com/apache/arrow-rs/pull/2514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Avoid large over allocate buffer in sync reader [\#2511](https://github.com/apache/arrow-rs/pull/2511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Avoid useless memory copies in IPC reader. 
[\#2510](https://github.com/apache/arrow-rs/pull/2510) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Refactor boolean kernels to use same codebase [\#2508](https://github.com/apache/arrow-rs/pull/2508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Remove Int96Converter \(\#2480\) [\#2481](https://github.com/apache/arrow-rs/pull/2481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) + ## [21.0.0](https://github.com/apache/arrow-rs/tree/21.0.0) (2022-08-18) [Full Changelog](https://github.com/apache/arrow-rs/compare/20.0.0...21.0.0) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69f2b8af6cf..4a063594dc9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,119 +19,108 @@ # Changelog -## [22.0.0](https://github.com/apache/arrow-rs/tree/22.0.0) (2022-09-02) +## [23.0.0](https://github.com/apache/arrow-rs/tree/23.0.0) (2022-09-16) -[Full Changelog](https://github.com/apache/arrow-rs/compare/21.0.0...22.0.0) +[Full Changelog](https://github.com/apache/arrow-rs/compare/22.0.0...23.0.0) **Breaking changes:** -- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2614](https://github.com/apache/arrow-rs/pull/2614) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Gate dyn comparison of dictionary arrays behind `dyn_cmp_dict` [\#2597](https://github.com/apache/arrow-rs/pull/2597) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Move JsonSerializable to json module \(\#2300\) [\#2595](https://github.com/apache/arrow-rs/pull/2595) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Decimal precision scale datatype change [\#2532](https://github.com/apache/arrow-rs/pull/2532) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor PrimitiveBuilder Constructors [\#2518](https://github.com/apache/arrow-rs/pull/2518) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactoring DecimalBuilder constructors [\#2517](https://github.com/apache/arrow-rs/pull/2517) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor FixedSizeBinaryBuilder Constructors [\#2516](https://github.com/apache/arrow-rs/pull/2516) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor BooleanBuilder Constructors [\#2515](https://github.com/apache/arrow-rs/pull/2515) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Refactor UnionBuilder Constructors [\#2488](https://github.com/apache/arrow-rs/pull/2488) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) +- Move JSON Test Format To integration-testing [\#2724](https://github.com/apache/arrow-rs/pull/2724) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Split out arrow-buffer crate \(\#2594\) [\#2693](https://github.com/apache/arrow-rs/pull/2693) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Simplify DictionaryBuilder constructors \(\#2684\) \(\#2054\) [\#2685](https://github.com/apache/arrow-rs/pull/2685) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Deprecate RecordBatch::concat replace with concat\_batches \(\#2594\) [\#2683](https://github.com/apache/arrow-rs/pull/2683) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add overflow-checking variant for primitive arithmetic kernels and explicitly define overflow behavior [\#2643](https://github.com/apache/arrow-rs/pull/2643) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Update thrift v0.16 and vendor parquet-format \(\#2502\) [\#2626](https://github.com/apache/arrow-rs/pull/2626) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update flight definitions including backwards-incompatible change to GetSchema [\#2586](https://github.com/apache/arrow-rs/pull/2586) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([liukun4515](https://github.com/liukun4515)) **Implemented enhancements:** -- Add Macros to assist with static dispatch [\#2635](https://github.com/apache/arrow-rs/issues/2635) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support comparison between DictionaryArray and BooleanArray [\#2617](https://github.com/apache/arrow-rs/issues/2617) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use `total_cmp` for floating value ordering and remove `nan_ordering` feature flag [\#2613](https://github.com/apache/arrow-rs/issues/2613) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support empty projection in CSV, JSON readers [\#2603](https://github.com/apache/arrow-rs/issues/2603) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Support SQL-compliant NaN ordering between for DictionaryArray and non-DictionaryArray [\#2599](https://github.com/apache/arrow-rs/issues/2599) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add `dyn_cmp_dict` feature flag to gate dyn comparison of dictionary arrays [\#2596](https://github.com/apache/arrow-rs/issues/2596) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2584](https://github.com/apache/arrow-rs/issues/2584) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Allow FlightSQL implementers to extend `do_get()` [\#2581](https://github.com/apache/arrow-rs/issues/2581) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] -- Support SQL-compliant behavior on `eq_dyn`, `neq_dyn`, `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2569](https://github.com/apache/arrow-rs/issues/2569) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add sql-compliant feature for enabling sql-compliant kernel behavior [\#2568](https://github.com/apache/arrow-rs/issues/2568) -- Calculate `sum` for dictionary array [\#2565](https://github.com/apache/arrow-rs/issues/2565) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add test for float nan comparison [\#2556](https://github.com/apache/arrow-rs/issues/2556) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with string array [\#2548](https://github.com/apache/arrow-rs/issues/2548) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with primitive array in `lt_dyn`, `lt_eq_dyn`, `gt_dyn`, `gt_eq_dyn` [\#2538](https://github.com/apache/arrow-rs/issues/2538) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2535](https://github.com/apache/arrow-rs/issues/2535) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- UnionBuilder Create Children With Capacity [\#2523](https://github.com/apache/arrow-rs/issues/2523) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Speed up `like_utf8_scalar` for `%pat%` [\#2519](https://github.com/apache/arrow-rs/issues/2519) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Replace macro with TypedDictionaryArray in comparison kernels [\#2513](https://github.com/apache/arrow-rs/issues/2513) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use same codebase for boolean kernels [\#2507](https://github.com/apache/arrow-rs/issues/2507) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Use u8 for Decimal Precision and Scale [\#2496](https://github.com/apache/arrow-rs/issues/2496) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Integrate skip row without pageIndex in SerializedPageReader in Fuzz Test [\#2475](https://github.com/apache/arrow-rs/issues/2475) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Avoid unecessary copies in Arrow IPC reader [\#2437](https://github.com/apache/arrow-rs/issues/2437) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add GenericColumnReader::skip\_records Missing OffsetIndex Fallback [\#2433](https://github.com/apache/arrow-rs/issues/2433) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Support Reading PageIndex with ParquetRecordBatchStream [\#2430](https://github.com/apache/arrow-rs/issues/2430) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Specialize FixedLenByteArrayReader for Parquet [\#2318](https://github.com/apache/arrow-rs/issues/2318) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Make JSON support Optional via Feature Flag [\#2300](https://github.com/apache/arrow-rs/issues/2300) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cleanup like and nlike utf8 kernels [\#2744](https://github.com/apache/arrow-rs/issues/2744) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Speedup eq and neq kernels for utf8 arrays [\#2742](https://github.com/apache/arrow-rs/issues/2742) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- API for more ergonomic construction of `RecordBatchOptions` [\#2728](https://github.com/apache/arrow-rs/issues/2728) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Automate updates to `CHANGELOG-old.md` [\#2726](https://github.com/apache/arrow-rs/issues/2726) +- Don't check the `DivideByZero` error for float modulus [\#2720](https://github.com/apache/arrow-rs/issues/2720) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `try_binary` should not panic on unequal array lengths.
[\#2715](https://github.com/apache/arrow-rs/issues/2715) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add benchmark for bitwise operation [\#2714](https://github.com/apache/arrow-rs/issues/2714) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2712](https://github.com/apache/arrow-rs/issues/2712) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add divide\_opt kernel which produces null values on division by zero error [\#2709](https://github.com/apache/arrow-rs/issues/2709) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add `DataType` function to detect nested types [\#2704](https://github.com/apache/arrow-rs/issues/2704) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add support of sorting dictionary of other primitive types [\#2700](https://github.com/apache/arrow-rs/issues/2700) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Sort indices of dictionary string values [\#2697](https://github.com/apache/arrow-rs/issues/2697) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support empty projection in `RecordBatch::project` [\#2690](https://github.com/apache/arrow-rs/issues/2690) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support sorting dictionary encoded primitive integer arrays [\#2679](https://github.com/apache/arrow-rs/issues/2679) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use BitIndexIterator in min\_max\_helper [\#2674](https://github.com/apache/arrow-rs/issues/2674) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support building comparator for dictionaries of primitive integer values [\#2672](https://github.com/apache/arrow-rs/issues/2672) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Change max/min string macro to generic helper function `min_max_helper` [\#2657](https://github.com/apache/arrow-rs/issues/2657) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant of arithmetic scalar kernels [\#2651](https://github.com/apache/arrow-rs/issues/2651) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Compare dictionary with binary array [\#2644](https://github.com/apache/arrow-rs/issues/2644) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Add overflow-checking variant for primitive arithmetic kernels [\#2642](https://github.com/apache/arrow-rs/issues/2642) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Use `downcast_primitive_array` in arithmetic kernels [\#2639](https://github.com/apache/arrow-rs/issues/2639) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Support DictionaryArray in temporal kernels [\#2622](https://github.com/apache/arrow-rs/issues/2622) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Inline Generated Thrift Code Into Parquet Crate [\#2502](https://github.com/apache/arrow-rs/issues/2502) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] **Fixed bugs:** -- Casting timestamp array to string should not ignore timezone [\#2607](https://github.com/apache/arrow-rs/issues/2607) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Ilike\_ut8\_scalar kernals have incorrect logic [\#2544](https://github.com/apache/arrow-rs/issues/2544) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Always validate the array data when creating array in IPC reader [\#2541](https://github.com/apache/arrow-rs/issues/2541)
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Int96Converter Truncates Timestamps [\#2480](https://github.com/apache/arrow-rs/issues/2480) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Error Reading Page Index When Not Available [\#2434](https://github.com/apache/arrow-rs/issues/2434) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- `ParquetFileArrowReader::get_record_reader[_by_colum]` `batch_size` overallocates [\#2321](https://github.com/apache/arrow-rs/issues/2321) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] - -**Documentation updates:** - -- Document All Arrow Features in docs.rs [\#2633](https://github.com/apache/arrow-rs/issues/2633) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Escape contains patterns for utf8 like kernels [\#2745](https://github.com/apache/arrow-rs/issues/2745) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Float Array should not panic on `DivideByZero` in the `Divide` kernel [\#2719](https://github.com/apache/arrow-rs/issues/2719) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- DictionaryBuilders can Create Invalid DictionaryArrays [\#2684](https://github.com/apache/arrow-rs/issues/2684) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- `arrow` crate does not build with `features = ["ffi"]` and `default_features = false`. [\#2670](https://github.com/apache/arrow-rs/issues/2670) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Invalid results with `RowSelector` having `row_count` of 0 [\#2669](https://github.com/apache/arrow-rs/issues/2669) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- clippy error: unresolved import `crate::array::layout` [\#2659](https://github.com/apache/arrow-rs/issues/2659) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Cast the numeric without the `CastOptions` [\#2648](https://github.com/apache/arrow-rs/issues/2648) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Explicitly define overflow behavior for primitive arithmetic kernels [\#2641](https://github.com/apache/arrow-rs/issues/2641) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- update the `flight.proto` and fix schema to SchemaResult [\#2571](https://github.com/apache/arrow-rs/issues/2571) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] +- Panic when first data page is skipped using ColumnChunkData::Sparse [\#2543](https://github.com/apache/arrow-rs/issues/2543) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] +- `SchemaResult` in IPC deviates from other implementations [\#2445](https://github.com/apache/arrow-rs/issues/2445) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] **Closed issues:** -- Add support for CAST from `Interval(DayTime)` to `Timestamp(Nanosecond, None)` [\#2606](https://github.com/apache/arrow-rs/issues/2606) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Why do we check for null in TypedDictionaryArray value function [\#2564](https://github.com/apache/arrow-rs/issues/2564) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Add the `length` field for `Buffer` [\#2524](https://github.com/apache/arrow-rs/issues/2524) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Avoid large over 
allocate buffer in async reader [\#2512](https://github.com/apache/arrow-rs/issues/2512) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] -- Rewriting Decimal Builders using `const_generic`. [\#2390](https://github.com/apache/arrow-rs/issues/2390) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] -- Rewrite Decimal Array using `const_generic` [\#2384](https://github.com/apache/arrow-rs/issues/2384) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] +- Implement collect for int values [\#2696](https://github.com/apache/arrow-rs/issues/2696) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] **Merged pull requests:** -- Add downcast macros \(\#2635\) [\#2636](https://github.com/apache/arrow-rs/pull/2636) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Document all arrow features in docs.rs \(\#2633\) [\#2634](https://github.com/apache/arrow-rs/pull/2634) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Document dyn\_cmp\_dict [\#2624](https://github.com/apache/arrow-rs/pull/2624) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support comparison between DictionaryArray and BooleanArray [\#2618](https://github.com/apache/arrow-rs/pull/2618) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Cast timestamp array to string array with timezone [\#2608](https://github.com/apache/arrow-rs/pull/2608) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Support empty projection in CSV and JSON readers [\#2604](https://github.com/apache/arrow-rs/pull/2604) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- Make JSON support optional via a feature flag \(\#2300\) [\#2601](https://github.com/apache/arrow-rs/pull/2601) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Support SQL-compliant NaN ordering for DictionaryArray and non-DictionaryArray [\#2600](https://github.com/apache/arrow-rs/pull/2600) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Split out integration test plumbing \(\#2594\) \(\#2300\) [\#2598](https://github.com/apache/arrow-rs/pull/2598) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Refactor Binary Builder and String Builder Constructors [\#2592](https://github.com/apache/arrow-rs/pull/2592) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Dictionary like scalar kernels [\#2591](https://github.com/apache/arrow-rs/pull/2591) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Validate dictionary key in TypedDictionaryArray \(\#2578\) [\#2589](https://github.com/apache/arrow-rs/pull/2589) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- Add max\_dyn and min\_dyn for max/min for dictionary array [\#2585](https://github.com/apache/arrow-rs/pull/2585) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Code cleanup of array value functions 
[\#2583](https://github.com/apache/arrow-rs/pull/2583) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Allow overriding of do\_get & export useful macro [\#2582](https://github.com/apache/arrow-rs/pull/2582) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) -- MINOR: Upgrade to pyo3 0.17 [\#2576](https://github.com/apache/arrow-rs/pull/2576) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([andygrove](https://github.com/andygrove)) -- Support SQL-compliant NaN behavior on eq\_dyn, neq\_dyn, lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn [\#2570](https://github.com/apache/arrow-rs/pull/2570) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add sum\_dyn to calculate sum for dictionary array [\#2566](https://github.com/apache/arrow-rs/pull/2566) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- struct UnionBuilder will create child buffers with capacity [\#2560](https://github.com/apache/arrow-rs/pull/2560) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kastolars](https://github.com/kastolars)) -- Don't panic on RleValueEncoder::flush\_buffer if empty \(\#2558\) [\#2559](https://github.com/apache/arrow-rs/pull/2559) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Add the `length` field for Buffer and use more `Buffer` in IPC reader to avoid memory copy. [\#2557](https://github.com/apache/arrow-rs/pull/2557) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([HaoYang670](https://github.com/HaoYang670)) -- Add test for float nan comparison [\#2555](https://github.com/apache/arrow-rs/pull/2555) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Compare dictionary array with string array [\#2549](https://github.com/apache/arrow-rs/pull/2549) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Always validate the array data \(except the `Decimal`\) when creating array in IPC reader [\#2547](https://github.com/apache/arrow-rs/pull/2547) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- MINOR: Fix test\_row\_type\_validation test [\#2546](https://github.com/apache/arrow-rs/pull/2546) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Fix ilike\_utf8\_scalar kernals [\#2545](https://github.com/apache/arrow-rs/pull/2545) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- fix typo [\#2540](https://github.com/apache/arrow-rs/pull/2540) ([00Masato](https://github.com/00Masato)) -- Compare dictionary array and primitive array in lt\_dyn, lt\_eq\_dyn, gt\_dyn, gt\_eq\_dyn kernels [\#2539](https://github.com/apache/arrow-rs/pull/2539) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- \[MINOR\]Avoid large over allocate buffer in async reader [\#2537](https://github.com/apache/arrow-rs/pull/2537) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Compare dictionary with primitive array in `eq_dyn` and `neq_dyn` [\#2533](https://github.com/apache/arrow-rs/pull/2533) 
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Add iterator for FixedSizeBinaryArray [\#2531](https://github.com/apache/arrow-rs/pull/2531) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) -- add bench: decimal with byte array and fixed length byte array [\#2529](https://github.com/apache/arrow-rs/pull/2529) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) -- Add FixedLengthByteArrayReader Remove ComplexObjectArrayReader [\#2528](https://github.com/apache/arrow-rs/pull/2528) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Split out byte array decoders \(\#2318\) [\#2527](https://github.com/apache/arrow-rs/pull/2527) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Use offset index in ParquetRecordBatchStream [\#2526](https://github.com/apache/arrow-rs/pull/2526) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) -- Clean the `create_array` in IPC reader. [\#2525](https://github.com/apache/arrow-rs/pull/2525) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Remove DecimalByteArrayConvert \(\#2480\) [\#2522](https://github.com/apache/arrow-rs/pull/2522) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) -- Improve performance of `%pat%` \(\>3x speedup\) [\#2521](https://github.com/apache/arrow-rs/pull/2521) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) -- remove len field from MapBuilder [\#2520](https://github.com/apache/arrow-rs/pull/2520) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([psvri](https://github.com/psvri)) -- Replace macro with TypedDictionaryArray in comparison kernels [\#2514](https://github.com/apache/arrow-rs/pull/2514) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Avoid large over allocate buffer in sync reader [\#2511](https://github.com/apache/arrow-rs/pull/2511) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) -- Avoid useless memory copies in IPC reader. 
[\#2510](https://github.com/apache/arrow-rs/pull/2510) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) -- Refactor boolean kernels to use same codebase [\#2508](https://github.com/apache/arrow-rs/pull/2508) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) -- Remove Int96Converter \(\#2480\) [\#2481](https://github.com/apache/arrow-rs/pull/2481) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Speedup string equal/not equal to empty string, cleanup like/ilike kernels, fix escape bug [\#2743](https://github.com/apache/arrow-rs/pull/2743) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Partially flatten arrow-buffer [\#2737](https://github.com/apache/arrow-rs/pull/2737) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Automate updates to `CHANGELOG-old.md` [\#2732](https://github.com/apache/arrow-rs/pull/2732) ([iajoiner](https://github.com/iajoiner)) +- Update read parquet example in parquet/arrow home [\#2730](https://github.com/apache/arrow-rs/pull/2730) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([datapythonista](https://github.com/datapythonista)) +- Better construction of RecordBatchOptions [\#2729](https://github.com/apache/arrow-rs/pull/2729) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([askoa](https://github.com/askoa)) +- benchmark: bitwise operation [\#2718](https://github.com/apache/arrow-rs/pull/2718) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Update `try_binary` and `checked_ops`, and remove `math_checked_op` [\#2717](https://github.com/apache/arrow-rs/pull/2717) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Support bitwise op in kernel: or,xor,not [\#2716](https://github.com/apache/arrow-rs/pull/2716) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add overflow-checking variants of arithmetic scalar dyn kernels [\#2713](https://github.com/apache/arrow-rs/pull/2713) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add divide\_opt kernel which produces null values on division by zero error [\#2710](https://github.com/apache/arrow-rs/pull/2710) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add DataType::is\_nested\(\) [\#2707](https://github.com/apache/arrow-rs/pull/2707) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([kfastov](https://github.com/kfastov)) +- Update criterion requirement from 0.3 to 0.4 [\#2706](https://github.com/apache/arrow-rs/pull/2706) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dependabot[bot]](https://github.com/apps/dependabot)) +- Support bitwise and operation in the kernel [\#2703](https://github.com/apache/arrow-rs/pull/2703) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Add support of sorting dictionary of other primitive arrays [\#2701](https://github.com/apache/arrow-rs/pull/2701) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Clarify docs of binary and
string builders [\#2699](https://github.com/apache/arrow-rs/pull/2699) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([datapythonista](https://github.com/datapythonista)) +- Sort indices of dictionary string values [\#2698](https://github.com/apache/arrow-rs/pull/2698) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Add support for empty projection in RecordBatch::project [\#2691](https://github.com/apache/arrow-rs/pull/2691) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan)) +- Temporarily disable Golang integration tests re-enable JS [\#2689](https://github.com/apache/arrow-rs/pull/2689) ([tustvold](https://github.com/tustvold)) +- Verify valid UTF-8 when converting byte array \(\#2205\) [\#2686](https://github.com/apache/arrow-rs/pull/2686) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support sorting dictionary encoded primitive integer arrays [\#2680](https://github.com/apache/arrow-rs/pull/2680) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Skip RowSelectors with zero rows [\#2678](https://github.com/apache/arrow-rs/pull/2678) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([askoa](https://github.com/askoa)) +- Faster Null Path Selection in ArrayData Equality [\#2676](https://github.com/apache/arrow-rs/pull/2676) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([dhruv9vats](https://github.com/dhruv9vats)) +- Use BitIndexIterator in min\_max\_helper [\#2675](https://github.com/apache/arrow-rs/pull/2675) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support building comparator for dictionaries of primitive integer values [\#2673](https://github.com/apache/arrow-rs/pull/2673) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- json feature always requires base64 feature [\#2668](https://github.com/apache/arrow-rs/pull/2668) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([eagletmt](https://github.com/eagletmt)) +- Add try\_unary, binary, try\_binary kernels ~90% faster [\#2666](https://github.com/apache/arrow-rs/pull/2666) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Use downcast\_dictionary\_array in unary\_dyn [\#2663](https://github.com/apache/arrow-rs/pull/2663) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- optimize the `numeric_cast_with_error` [\#2661](https://github.com/apache/arrow-rs/pull/2661) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- ffi feature also requires layout [\#2660](https://github.com/apache/arrow-rs/pull/2660) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Change max/min string macro to generic helper function min\_max\_helper [\#2658](https://github.com/apache/arrow-rs/pull/2658) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix flaky test `test_fuzz_async_reader_selection` [\#2656](https://github.com/apache/arrow-rs/pull/2656) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- MINOR: Ignore flaky test test\_fuzz\_async\_reader\_selection 
[\#2655](https://github.com/apache/arrow-rs/pull/2655) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([viirya](https://github.com/viirya)) +- MutableBuffer::typed\_data - shared ref access to the typed slice [\#2652](https://github.com/apache/arrow-rs/pull/2652) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([medwards](https://github.com/medwards)) +- Overflow-checking variant of arithmetic scalar kernels [\#2650](https://github.com/apache/arrow-rs/pull/2650) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- support `CastOption` for casting numeric [\#2649](https://github.com/apache/arrow-rs/pull/2649) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515)) +- Help LLVM vectorize comparison kernel ~50-80% faster [\#2646](https://github.com/apache/arrow-rs/pull/2646) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Support comparison between dictionary array and binary array [\#2645](https://github.com/apache/arrow-rs/pull/2645) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Use `downcast_primitive_array` in arithmetic kernels [\#2640](https://github.com/apache/arrow-rs/pull/2640) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fully qualifying parquet items [\#2638](https://github.com/apache/arrow-rs/pull/2638) ([dingxiangfei2009](https://github.com/dingxiangfei2009)) +- Support DictionaryArray in temporal kernels [\#2623](https://github.com/apache/arrow-rs/pull/2623) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Comparable Row Format [\#2593](https://github.com/apache/arrow-rs/pull/2593) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix bug in page skipping [\#2552](https://github.com/apache/arrow-rs/pull/2552) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index ecf02625c9d..a6fb8751c2d 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "22.0.0" +version = "23.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,7 +27,7 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "22.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index 9e9a18ad478..e01809f3813 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "22.0.0" +arrow-flight = "23.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. 
See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 9aef5a0570a..38bbcf9e8bc 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "22.0.0" +version = "23.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "22.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "23.0.0", features = ["pyarrow"] } pyo3 = { version = "0.17", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 1580856dfc0..f1918fccd1f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow" -version = "22.0.0" +version = "23.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/README.md b/arrow/README.md index 7a95df0f225..a1c0e6279a5 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -35,7 +35,7 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `22.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `23.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. ## Feature Flags @@ -61,7 +61,7 @@ The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists ## Safety -Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/22.0.01/18/soundness-pledge.html). Specifically: +Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here](https://raphlinus.github.io/rust/23.0.01/18/soundness-pledge.html). Specifically: > The intent of this crate is to be free of soundness bugs. The developers will do their best to avoid them, and welcome help in analyzing and fixing them diff --git a/dev/release/README.md b/dev/release/README.md index 3783301e9be..48748eccbe8 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -78,7 +78,7 @@ CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh git commit -a -m 'Create changelog' # update versions -sed -i '' -e 's/14.0.0/22.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` +sed -i '' -e 's/14.0.0/23.0.0/g' `find . 
-name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md` git commit -a -m 'Update version' ``` diff --git a/dev/release/label_issues.py b/dev/release/label_issues.py new file mode 100755 index 00000000000..b004b7fa7f8 --- /dev/null +++ b/dev/release/label_issues.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +############################################################################## +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +############################################################################## + +# Python script to add labels to github issues from the PRs that closed them +# +# Required setup: +# $ pip install PyGithub +# +# ARROW_GITHUB_API_TOKEN needs to be set to your github token +from github import Github +import os +import re + + + +# get all cross referenced issues from the named issue +# (aka linked PRs) +# issue = arrow_repo.get_issue(issue_number) +def get_cross_referenced_issues(issue): + all_issues = set() + for timeline_item in issue.get_timeline(): + if timeline_item.event == 'cross-referenced' and timeline_item.source.type == 'issue': + all_issues.add(timeline_item.source.issue) + + # convert to list + return [i for i in all_issues] + + +# labels not to transfer +BLACKLIST_LABELS = {'development-process', 'api-change'} + +# Adds labels to the specified issue with the labels from linked pull requests +def relabel_issue(arrow_repo, issue_number): + #print(issue_number, 'fetching issue') + issue = arrow_repo.get_issue(issue_number) + print('considering issue', issue.html_url) + linked_issues = get_cross_referenced_issues(issue) + #print(' ', 'cross referenced issues:', linked_issues) + + # Figure out what labels need to be added, if any + existing_labels = set() + for label in issue.labels: + existing_labels.add(label.name) + + # find all labels to add + for linked_issue in linked_issues: + if linked_issue.pull_request is None: + print(' ', 'not pull request, skipping', linked_issue.html_url) + continue + + if linked_issue.repository.name != 'arrow-rs': + print(' ', 'not in arrow-rs, skipping', linked_issue.html_url) + continue + + print(' ', 'finding labels for linked pr', linked_issue.html_url) + linked_labels = set() + for label in linked_issue.labels: + linked_labels.add(label.name) + #print(' ', 'existing labels:', existing_labels) + + labels_to_add = linked_labels.difference(existing_labels) + + # remove any blacklist labels, if any + for l in BLACKLIST_LABELS: + labels_to_add.discard(l) + + if len(labels_to_add) > 0: + print(' ', 'adding labels: ', labels_to_add, 'to', issue.number) + for label in labels_to_add: + issue.add_to_labels(label) + print(' ', 'added', label) + existing_labels.add(label) + + # leave a note about what updated these labels + issue.create_comment('`label_issue.py` automatically added labels {} 
from #{}'.format(labels_to_add, linked_issue.number)) + + +# what section headings in the CHANGELOG.md file contain closed issues that may need relabeling +ISSUE_SECTION_NAMES = ['Closed issues:', 'Fixed bugs:', 'Implemented enhancements:'] + +# find all possible issues / bugs by scraping CHANGELOG.md +# +# TODO: Find all tickets merged since this tag +# The compare api can find all commits since that tag +# I could not find a good way in the github API to find the PRs connected to a commit +#since_tag = '22.0.0' + +def find_issues_from_changelog(): + script_dir = os.path.dirname(os.path.realpath(__file__)) + path = os.path.join(script_dir, '..', '..', 'CHANGELOG.md') + + issues = set() + + # Flag tracking whether the current line is inside one of the issue sections + in_issue_section = False + + with open(path, 'r') as f: + for line in f: + #print('line: ', line) + line = line.strip() + if line.startswith('**'): + section_name = line.replace('**', '') + if section_name in ISSUE_SECTION_NAMES: + #print(' ', 'is issue section', section_name) + in_issue_section = True + else: + #print(' ', 'is not issue section', section_name) + in_issue_section = False + + if in_issue_section: + match = re.search('#([\d]+)', line) + if match is not None: + #print(' ', 'reference', match.group(1)) + issues.add(match.group(1)) + + # Convert to a sorted list of issue numbers + return sorted([int(i) for i in issues]) + + +if __name__ == '__main__': + print('Attempting to label github issues from their corresponding PRs') + + issues = find_issues_from_changelog() + print('Issues found in CHANGELOG: ', issues) + + github_token = os.environ.get("ARROW_GITHUB_API_TOKEN") + + print('logging into GITHUB...') + github = Github(github_token) + + print('getting github repo...') + arrow_repo = github.get_repo('apache/arrow-rs') + + for issue in issues: + relabel_issue(arrow_repo, issue) diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml index b9f6cf81855..e45b812dd6a 100644 --- a/integration-testing/Cargo.toml +++ b/integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-integration-testing" description = "Binaries used in the Arrow integration tests" -version = "22.0.0" +version = "23.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a2d11eb5862..9b95868f3fc 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -41,7 +41,7 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "22.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "23.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "3", default-features = false, features = ["std", "derive", "env"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } @@ -61,7 +61,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version =
"1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "22.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } +arrow = { path = "../arrow", version = "23.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint", "json"] } [package.metadata.docs.rs] all-features = true diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml index e32ee1ace5b..54aa6d52f1e 100644 --- a/parquet_derive/Cargo.toml +++ b/parquet_derive/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Derive macros for the Rust implementation of Apache Parquet" homepage = "https://github.com/apache/arrow-rs" @@ -35,4 +35,4 @@ proc-macro = true proc-macro2 = { version = "1.0", default-features = false } quote = { version = "1.0", default-features = false } syn = { version = "1.0", default-features = false } -parquet = { path = "../parquet", version = "22.0.0" } +parquet = { path = "../parquet", version = "23.0.0" } diff --git a/parquet_derive/README.md b/parquet_derive/README.md index d3d7f56ebf6..4aae73dfc2e 100644 --- a/parquet_derive/README.md +++ b/parquet_derive/README.md @@ -32,8 +32,8 @@ Add this to your Cargo.toml: ```toml [dependencies] -parquet = "22.0.0" -parquet_derive = "22.0.0" +parquet = "23.0.0" +parquet_derive = "23.0.0" ``` and this to your crate root: diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml index 4b814c4c088..dd8486da2ca 100644 --- a/parquet_derive_test/Cargo.toml +++ b/parquet_derive_test/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet_derive_test" -version = "22.0.0" +version = "23.0.0" license = "Apache-2.0" description = "Integration test package for parquet-derive" homepage = "https://github.com/apache/arrow-rs" @@ -29,6 +29,6 @@ publish = false rust-version = "1.62" [dependencies] -parquet = { path = "../parquet", version = "22.0.0", default-features = false } -parquet_derive = { path = "../parquet_derive", version = "22.0.0", default-features = false } +parquet = { path = "../parquet", version = "23.0.0", default-features = false } +parquet_derive = { path = "../parquet_derive", version = "23.0.0", default-features = false } chrono = { version="0.4.19", default-features = false, features = [ "clock" ] } From 5a55406cf24171600a143a83a95046c7513fd92c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Sep 2022 12:40:37 -0400 Subject: [PATCH 10/16] update new `arrow-buffer` crate to 23.0.0 (#2748) * update new `arrow-buffer` crate to 23.0.0 * Update dependency --- arrow-buffer/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml index 87019111efc..c1bcd9f6306 100644 --- a/arrow-buffer/Cargo.toml +++ b/arrow-buffer/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "arrow-buffer" -version = "22.0.0" +version = "23.0.0" description = "Buffer abstractions for Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f1918fccd1f..7391ffcf827 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,7 +44,7 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = 
["runtime-rng"] } [dependencies] -arrow-buffer = { path = "../arrow-buffer", version = "22.0.0" } +arrow-buffer = { path = "../arrow-buffer", version = "23.0.0" } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } From ca00b671500b693f8c5e07ac4ea600269adfa2b6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Sep 2022 19:42:20 -0400 Subject: [PATCH 11/16] Fix `verify_release_candidate.sh` for new arrow subcrates (#2752) --- dev/release/verify-release-candidate.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index cf8050c1c9f..98c582c2e17 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -116,21 +116,16 @@ test_source_distribution() { export ARROW_TEST_DATA=$PWD/arrow-testing-data/data export PARQUET_TEST_DATA=$PWD/parquet-testing-data/data - # use local modules because we don't publish modules to crates.io yet - sed \ - -i.bak \ - -E \ - -e 's/^arrow = "([^"]*)"/arrow = { version = "\1", path = "..\/arrow" }/g' \ - -e 's/^parquet = "([^"]*)"/parquet = { version = "\1", path = "..\/parquet" }/g' \ - */Cargo.toml - (cd arrow && cargo build && cargo test) (cd arrow-flight && cargo build && cargo test) (cd parquet && cargo build && cargo test) (cd parquet_derive && cargo build && cargo test) - # verify that the crates can be published to crates.io - pushd arrow + # verify that the leaf crates can be published to crates.io + # we can't verify crates that depend on others + # (because the others haven't yet been published to crates.io) + + pushd arrow-buffer cargo publish --dry-run popd From 46fcb0c93c7b6e2067ff6a5b5bc0b0108ca3c2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 17 Sep 2022 05:58:09 +0200 Subject: [PATCH 12/16] Speed up checked kernels for non-null data (~1.4-5x faster) (#2749) * Speed up checked kernels * Fast path for non-null * Move some code --- arrow/src/compute/kernels/arity.rs | 31 ++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 5060234c71b..216e3bfcac3 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -106,15 +106,26 @@ where let len = array.len(); let null_count = array.null_count(); - let mut buffer = BufferBuilder::::new(len); - buffer.append_n_zeroed(array.len()); - let slice = buffer.as_slice_mut(); + if null_count == 0 { + let values = array.values().iter().map(|v| op(*v)); + // JUSTIFICATION + // Benefit + // ~60% speedup + // Soundness + // `values` is an iterator with a known size because arrays are sized. + let buffer = unsafe { Buffer::try_from_trusted_len_iter(values)? }; + return Ok(unsafe { build_primitive_array(len, buffer, 0, None) }); + } let null_buffer = array .data_ref() .null_buffer() .map(|b| b.bit_slice(array.offset(), array.len())); + let mut buffer = BufferBuilder::::new(len); + buffer.append_n_zeroed(array.len()); + let slice = buffer.as_slice_mut(); + try_for_each_valid_idx(array.len(), 0, null_count, null_buffer.as_deref(), |idx| { unsafe { *slice.get_unchecked_mut(idx) = op(array.value_unchecked(idx))? 
        };
        Ok::<_, ArrowError>(())
@@ -284,9 +295,21 @@ where
     if a.is_empty() {
         return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
     }
-
     let len = a.len();
+
+    if a.null_count() == 0 && b.null_count() == 0 {
+        let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
+        let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?;
+        // JUSTIFICATION
+        //  Benefit
+        //      ~75% speedup
+        //  Soundness
+        //      `values` is an iterator with a known size from a PrimitiveArray
+        return Ok(unsafe { build_primitive_array(len, buffer, 0, None) });
+    }
+
     let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap();
+
     let null_count = null_buffer
         .as_ref()
         .map(|x| len - x.count_set_bits())

From 5e83ef9cc7e426171f4cb9451fa004c55c7c95be Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Sat, 17 Sep 2022 01:04:02 -0700
Subject: [PATCH 13/16] Add value type check in try_unary_dict (#2755)

---
 arrow/src/compute/kernels/arity.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs
index 216e3bfcac3..12cf9721f97 100644
--- a/arrow/src/compute/kernels/arity.rs
+++ b/arrow/src/compute/kernels/arity.rs
@@ -156,6 +156,13 @@ where
     T: ArrowPrimitiveType,
     F: Fn(T::Native) -> Result<T::Native>,
 {
+    if array.value_type() != T::DATA_TYPE {
+        return Err(ArrowError::CastError(format!(
+            "Cannot perform the unary operation on dictionary array of value type {}",
+            array.value_type()
+        )));
+    }
+
     let dict_values = array.values().as_any().downcast_ref().unwrap();
     let values = try_unary::<T, F, T>(dict_values, op)?.into_data();
     let data = array.data().clone().into_builder().child_data(vec![values]);

From 3bf6eb98ceb3962e1d9419da6dc93e609f7893e6 Mon Sep 17 00:00:00 2001
From: aksharau
Date: Mon, 19 Sep 2022 11:18:23 +0530
Subject: [PATCH 14/16] Fix: Issue 2721 : binary function should not panic but
 return error when array lengths are unequal (#2750)

---
 arrow/src/compute/kernels/arithmetic.rs | 14 +++-------
 arrow/src/compute/kernels/arity.rs      | 36 +++++++++++++++----------
 arrow/src/compute/kernels/bitwise.rs    |  9 ++-----
 3 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index 7b91a261c7e..b1a62ccfd6a 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -69,13 +69,7 @@ where
     RT: ArrowNumericType,
     F: Fn(LT::Native, RT::Native) -> LT::Native,
 {
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform math operation on arrays of different length".to_string(),
-        ));
-    }
-
-    Ok(binary(left, right, op))
+    binary(left, right, op)
 }

 /// Helper function for operations where a valid `0` on the right array should
@@ -1128,13 +1122,13 @@ where
     T: ArrowNumericType,
     T::Native: ArrowNativeTypeOp + Zero + One,
 {
-    Ok(binary_opt(left, right, |a, b| {
+    binary_opt(left, right, |a, b| {
         if b.is_zero() {
             None
         } else {
             Some(a.div_wrapping(b))
         }
-    }))
+    })
 }

 /// Perform `left / right` operation on two arrays. If either left or right value is null
 /// then the result is also null. If any right hand value is zero then the result of this
 /// operation will be `Err(ArrowError::DivideByZero)`.
@@ -1670,7 +1664,7 @@ mod tests {
         let b = Int32Array::from(vec![6, 7, 8]);
         let e = add(&a, &b).expect_err("should have failed due to different lengths");
         assert_eq!(
-            "ComputeError(\"Cannot perform math operation on arrays of different length\")",
+            "ComputeError(\"Cannot perform binary operation on arrays of different length\")",
             format!("{:?}", e)
         );
     }

diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs
index 12cf9721f97..2347502f96e 100644
--- a/arrow/src/compute/kernels/arity.rs
+++ b/arrow/src/compute/kernels/arity.rs
@@ -235,25 +235,29 @@ where
 /// especially when the operation can be vectorised, however, requires `op` to be infallible
 /// for all possible values of its inputs
 ///
-/// # Panic
 ///
-/// Panics if the arrays have different lengths
+/// # Error
+///
+/// This function returns an error if the arrays have different lengths
 pub fn binary<A, B, F, O>(
     a: &PrimitiveArray<A>,
     b: &PrimitiveArray<B>,
     op: F,
-) -> PrimitiveArray<O>
+) -> Result<PrimitiveArray<O>>
 where
     A: ArrowPrimitiveType,
     B: ArrowPrimitiveType,
     O: ArrowPrimitiveType,
     F: Fn(A::Native, B::Native) -> O::Native,
 {
-    assert_eq!(a.len(), b.len());
+    if a.len() != b.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform binary operation on arrays of different length".to_string(),
+        ));
+    }
     let len = a.len();

     if a.is_empty() {
-        return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE));
+        return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
     }

     let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap();
@@ -270,7 +274,7 @@ where
     //     `values` is an iterator with a known size from a PrimitiveArray
     let buffer = unsafe { Buffer::from_trusted_len_iter(values) };

-    unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }
+    Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) })
 }

 /// Applies the provided fallible binary operation across `a` and `b`, returning any error,
@@ -344,32 +348,36 @@ where
 ///
 /// The function is only evaluated for non-null indices
 ///
-/// # Panic
 ///
-/// Panics if the arrays have different lengths
+/// # Error
+///
+/// This function returns an error if the arrays have different lengths
 pub(crate) fn binary_opt<A, B, F, O>(
     a: &PrimitiveArray<A>,
     b: &PrimitiveArray<B>,
     op: F,
-) -> PrimitiveArray<O>
+) -> Result<PrimitiveArray<O>>
 where
     A: ArrowPrimitiveType,
     B: ArrowPrimitiveType,
     O: ArrowPrimitiveType,
     F: Fn(A::Native, B::Native) -> Option<O::Native>,
 {
-    assert_eq!(a.len(), b.len());
+    if a.len() != b.len() {
+        return Err(ArrowError::ComputeError(
+            "Cannot perform binary operation on arrays of different length".to_string(),
+        ));
+    }

     if a.is_empty() {
-        return PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE));
+        return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
     }

     if a.null_count() == 0 && b.null_count() == 0 {
-        a.values()
+        Ok(a.values()
             .iter()
             .zip(b.values().iter())
             .map(|(a, b)| op(*a, *b))
-            .collect()
+            .collect())
     } else {
         let iter_a = ArrayIter::new(a);
         let iter_b = ArrayIter::new(b);
@@ -386,7 +394,7 @@ where
             }
         });

-        values.collect()
+        Ok(values.collect())
     }
 }

diff --git a/arrow/src/compute/kernels/bitwise.rs b/arrow/src/compute/kernels/bitwise.rs
index 2f3c9e490f4..0b877b32648 100644
--- a/arrow/src/compute/kernels/bitwise.rs
+++ b/arrow/src/compute/kernels/bitwise.rs
@@ -18,7 +18,7 @@
 use crate::array::PrimitiveArray;
 use crate::compute::{binary, unary};
 use crate::datatypes::ArrowNumericType;
-use crate::error::{ArrowError, Result};
+use crate::error::Result;
 use std::ops::{BitAnd, BitOr, BitXor, Not};

 // Helper function for bitwise operations on two arrays
@@ -31,12 +31,7 @@ where
     T: ArrowNumericType,
     F: Fn(T::Native, T::Native) -> T::Native,
 {
-    if left.len() != right.len() {
-        return Err(ArrowError::ComputeError(
-            "Cannot perform bitwise operation on arrays of different length".to_string(),
-        ));
-    }
-    Ok(binary(left, right, op))
+    binary(left, right, op)
 }

 /// Perform `left & right` operation on two arrays. If either left or right value is null

From 9599178c953a7980ec1841d06e2232a671b5cbb3 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh
Date: Tue, 20 Sep 2022 03:30:37 -0700
Subject: [PATCH 15/16] Add overflow-checking variants of arithmetic dyn kernels (#2740)

* Init

* More

* More

* Add tests

* Fix clippy

* Remove macro

* Update doc

* Fix clippy

* Remove length check

* Tweak try_binary to coordinate latest optimization

* Fix clippy

* Use for loop

* Split non-null variant into never inline function

* Add value type check

* Multiply by get_byte_width of output type.
---
 arrow/src/compute/kernels/arithmetic.rs | 470 +++++++++++++++++++++---
 arrow/src/compute/kernels/arity.rs      |  82 +++--
 2 files changed, 466 insertions(+), 86 deletions(-)

diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs
index b1a62ccfd6a..aa6c8cd6694 100644
--- a/arrow/src/compute/kernels/arithmetic.rs
+++ b/arrow/src/compute/kernels/arithmetic.rs
@@ -68,10 +68,30 @@ where
     LT: ArrowNumericType,
     RT: ArrowNumericType,
     F: Fn(LT::Native, RT::Native) -> LT::Native,
+    LT::Native: ArrowNativeTypeOp,
+    RT::Native: ArrowNativeTypeOp,
 {
     binary(left, right, op)
 }

+/// This is similar to `math_op` as it performs the given operation between two input primitive
+/// arrays. But the given operation can return `Err` if overflow is detected. In that case, this
+/// function returns an `Err`.
+fn math_checked_op<LT, RT, F>(
+    left: &PrimitiveArray<LT>,
+    right: &PrimitiveArray<RT>,
+    op: F,
+) -> Result<PrimitiveArray<LT>>
+where
+    LT: ArrowNumericType,
+    RT: ArrowNumericType,
+    F: Fn(LT::Native, RT::Native) -> Result<LT::Native>,
+    LT::Native: ArrowNativeTypeOp,
+    RT::Native: ArrowNativeTypeOp,
+{
+    try_binary(left, right, op)
+}
+
 /// Helper function for operations where a valid `0` on the right array should
 /// result in an [ArrowError::DivideByZero], namely the division and modulo operations
@@ -516,57 +536,64 @@ macro_rules! typed_dict_math_op {
     }};
 }

-/// Helper function to perform math lambda function on values from two dictionary arrays, this
-/// version does not attempt to use SIMD explicitly (though the compiler may auto vectorize)
-macro_rules! math_dict_op {
-    ($left: expr, $right:expr, $op:expr, $value_ty:ty) => {{
-        if $left.len() != $right.len() {
-            return Err(ArrowError::ComputeError(format!(
-                "Cannot perform operation on arrays of different length ({}, {})",
-                $left.len(),
-                $right.len()
-            )));
-        }
+/// Perform the given operation on two `DictionaryArray`s.
+/// Returns an error if the two arrays have different value types.
+fn math_op_dict<K, T, F>(
+    left: &DictionaryArray<K>,
+    right: &DictionaryArray<K>,
+    op: F,
+) -> Result<PrimitiveArray<T>>
+where
+    K: ArrowNumericType,
+    T: ArrowNumericType,
+    F: Fn(T::Native, T::Native) -> T::Native,
+    T::Native: ArrowNativeTypeOp,
+{
+    if left.len() != right.len() {
+        return Err(ArrowError::ComputeError(format!(
+            "Cannot perform operation on arrays of different length ({}, {})",
+            left.len(),
+            right.len()
+        )));
+    }

-        // Safety justification: Since the inputs are valid Arrow arrays, all values are
-        // valid indexes into the dictionary (which is verified during construction)
-
-        let left_iter = unsafe {
-            $left
-                .values()
-                .as_any()
-                .downcast_ref::<$value_ty>()
-                .unwrap()
-                .take_iter_unchecked($left.keys_iter())
-        };
-
-        let right_iter = unsafe {
-            $right
-                .values()
-                .as_any()
-                .downcast_ref::<$value_ty>()
-                .unwrap()
-                .take_iter_unchecked($right.keys_iter())
-        };
-
-        let result = left_iter
-            .zip(right_iter)
-            .map(|(left_value, right_value)| {
-                if let (Some(left), Some(right)) = (left_value, right_value) {
-                    Some($op(left, right))
-                } else {
-                    None
-                }
-            })
-            .collect();
+    // Safety justification: Since the inputs are valid Arrow arrays, all values are
+    // valid indexes into the dictionary (which is verified during construction)

-        Ok(result)
-    }};
+    let left_iter = unsafe {
+        left.values()
+            .as_any()
+            .downcast_ref::<PrimitiveArray<T>>()
+            .unwrap()
+            .take_iter_unchecked(left.keys_iter())
+    };
+
+    let right_iter = unsafe {
+        right
+            .values()
+            .as_any()
+            .downcast_ref::<PrimitiveArray<T>>()
+            .unwrap()
+            .take_iter_unchecked(right.keys_iter())
+    };
+
+    let result = left_iter
+        .zip(right_iter)
+        .map(|(left_value, right_value)| {
+            if let (Some(left), Some(right)) = (left_value, right_value) {
+                Some(op(left, right))
+            } else {
+                None
+            }
+        })
+        .collect();
+
+    Ok(result)
 }

 /// Perform the given operation on two `DictionaryArray`s.
 /// Returns an error if the two arrays have different value types.
-fn math_op_dict<K, T, F>(
+fn math_checked_op_dict<K, T, F>(
     left: &DictionaryArray<K>,
     right: &DictionaryArray<K>,
     op: F,
@@ -574,9 +601,21 @@ where
     K: ArrowNumericType,
     T: ArrowNumericType,
-    F: Fn(T::Native, T::Native) -> T::Native,
+    F: Fn(T::Native, T::Native) -> Result<T::Native>,
+    T::Native: ArrowNativeTypeOp,
 {
-    math_dict_op!(left, right, op, PrimitiveArray<T>)
+    // The left and right value types are guaranteed to be the same by the caller macro.
+    if left.value_type() != T::DATA_TYPE {
+        return Err(ArrowError::NotYetImplemented(format!(
+            "Cannot perform provided operation on dictionary array of value type {}",
+            left.value_type()
+        )));
+    }
+
+    let left = left.downcast_dict::<PrimitiveArray<T>>().unwrap();
+    let right = right.downcast_dict::<PrimitiveArray<T>>().unwrap();
+
+    try_binary(left, right, op)
 }

 /// Helper function for operations where a valid `0` on the right array should
@@ -672,10 +711,13 @@ where

 /// Perform `left + right` operation on two arrays. If either left or right value is null
 /// then the result is also null.
+///
+/// This doesn't detect overflow. If the operation overflows, the result wraps around.
+/// For an overflow-checking variant, use `add_dyn_checked` instead.
 pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
     match left.data_type() {
         DataType::Dictionary(_, _) => {
-            typed_dict_math_op!(left, right, |a, b| a + b, math_op_dict)
+            typed_dict_math_op!(left, right, |a, b| a.add_wrapping(b), math_op_dict)
         }
         DataType::Date32 => {
             let l = as_primitive_array::<Date32Type>(left);
             match right.data_type() {
                 DataType::Interval(IntervalUnit::YearMonth) => {
                     let r = as_primitive_array::<IntervalYearMonthType>(right);
                     let res = math_op(l, r, Date32Type::add_year_months)?;
                     Ok(Arc::new(res))
                 }
@@ -728,7 +770,84 @@ pub fn add_dyn(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
         _ => {
             downcast_primitive_array!(
                 (left, right) => {
-                    math_op(left, right, |a, b| a + b).map(|a| Arc::new(a) as ArrayRef)
+                    math_op(left, right, |a, b| a.add_wrapping(b)).map(|a| Arc::new(a) as ArrayRef)
                 }
                 _ => Err(ArrowError::CastError(format!(
                     "Unsupported data type {}, {}",
                     left.data_type(), right.data_type()
                 )))
             )
         }
     }
 }
+
+/// Perform `left + right` operation on two arrays. If either left or right value is null
+/// then the result is also null.
+///
+/// This detects overflow and returns an `Err` for that. For a non-overflow-checking variant,
+/// use `add_dyn` instead.
+pub fn add_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
+    match left.data_type() {
+        DataType::Dictionary(_, _) => {
+            typed_dict_math_op!(
+                left,
+                right,
+                |a, b| a.add_checked(b),
+                math_checked_op_dict
+            )
+        }
+        DataType::Date32 => {
+            let l = as_primitive_array::<Date32Type>(left);
+            match right.data_type() {
+                DataType::Interval(IntervalUnit::YearMonth) => {
+                    let r = as_primitive_array::<IntervalYearMonthType>(right);
+                    let res = math_op(l, r, Date32Type::add_year_months)?;
+                    Ok(Arc::new(res))
+                }
+                DataType::Interval(IntervalUnit::DayTime) => {
+                    let r = as_primitive_array::<IntervalDayTimeType>(right);
+                    let res = math_op(l, r, Date32Type::add_day_time)?;
+                    Ok(Arc::new(res))
+                }
+                DataType::Interval(IntervalUnit::MonthDayNano) => {
+                    let r = as_primitive_array::<IntervalMonthDayNanoType>(right);
+                    let res = math_op(l, r, Date32Type::add_month_day_nano)?;
+                    Ok(Arc::new(res))
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Cannot perform arithmetic operation between array of type {} and array of type {}",
+                    left.data_type(), right.data_type()
+                ))),
+            }
+        }
+        DataType::Date64 => {
+            let l = as_primitive_array::<Date64Type>(left);
+            match right.data_type() {
+                DataType::Interval(IntervalUnit::YearMonth) => {
+                    let r = as_primitive_array::<IntervalYearMonthType>(right);
+                    let res = math_op(l, r, Date64Type::add_year_months)?;
+                    Ok(Arc::new(res))
+                }
+                DataType::Interval(IntervalUnit::DayTime) => {
+                    let r = as_primitive_array::<IntervalDayTimeType>(right);
+                    let res = math_op(l, r, Date64Type::add_day_time)?;
+                    Ok(Arc::new(res))
+                }
+                DataType::Interval(IntervalUnit::MonthDayNano) => {
+                    let r = as_primitive_array::<IntervalMonthDayNanoType>(right);
+                    let res = math_op(l, r, Date64Type::add_month_day_nano)?;
+                    Ok(Arc::new(res))
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Cannot perform arithmetic operation between array of type {} and array of type {}",
+                    left.data_type(), right.data_type()
+                ))),
+            }
+        }
+        _ => {
+            downcast_primitive_array!(
+                (left, right) => {
+                    math_checked_op(left, right, |a, b| a.add_checked(b)).map(|a| Arc::new(a) as ArrayRef)
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Unsupported data type {}, {}",
+                    left.data_type(), right.data_type()
+                )))
+            )
+        }
+    }
+}
@@ -839,15 +958,47 @@ where

 /// Perform `left - right` operation on two arrays. If either left or right value is null
 /// then the result is also null.
+///
+/// This doesn't detect overflow. If the operation overflows, the result wraps around.
+/// For an overflow-checking variant, use `subtract_dyn_checked` instead.
 pub fn subtract_dyn(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
     match left.data_type() {
         DataType::Dictionary(_, _) => {
-            typed_dict_math_op!(left, right, |a, b| a - b, math_op_dict)
+            typed_dict_math_op!(left, right, |a, b| a.sub_wrapping(b), math_op_dict)
         }
         _ => {
             downcast_primitive_array!(
                 (left, right) => {
-                    math_op(left, right, |a, b| a - b).map(|a| Arc::new(a) as ArrayRef)
+                    math_op(left, right, |a, b| a.sub_wrapping(b)).map(|a| Arc::new(a) as ArrayRef)
                 }
                 _ => Err(ArrowError::CastError(format!(
                     "Unsupported data type {}, {}",
                     left.data_type(), right.data_type()
                 )))
             )
         }
     }
 }
+
+/// Perform `left - right` operation on two arrays. If either left or right value is null
+/// then the result is also null.
+///
+/// This detects overflow and returns an `Err` for that. For a non-overflow-checking variant,
+/// use `subtract_dyn` instead.
+pub fn subtract_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
+    match left.data_type() {
+        DataType::Dictionary(_, _) => {
+            typed_dict_math_op!(
+                left,
+                right,
+                |a, b| a.sub_checked(b),
+                math_checked_op_dict
+            )
+        }
+        _ => {
+            downcast_primitive_array!(
+                (left, right) => {
+                    math_checked_op(left, right, |a, b| a.sub_checked(b)).map(|a| Arc::new(a) as ArrayRef)
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Unsupported data type {}, {}",
+                    left.data_type(), right.data_type()
+                )))
+            )
+        }
+    }
+}
@@ -977,15 +1128,47 @@ where

 /// Perform `left * right` operation on two arrays. If either left or right value is null
 /// then the result is also null.
+///
+/// This doesn't detect overflow. If the operation overflows, the result wraps around.
+/// For an overflow-checking variant, use `multiply_dyn_checked` instead.
 pub fn multiply_dyn(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
     match left.data_type() {
         DataType::Dictionary(_, _) => {
-            typed_dict_math_op!(left, right, |a, b| a * b, math_op_dict)
+            typed_dict_math_op!(left, right, |a, b| a.mul_wrapping(b), math_op_dict)
         }
         _ => {
             downcast_primitive_array!(
                 (left, right) => {
-                    math_op(left, right, |a, b| a * b).map(|a| Arc::new(a) as ArrayRef)
+                    math_op(left, right, |a, b| a.mul_wrapping(b)).map(|a| Arc::new(a) as ArrayRef)
                 }
                 _ => Err(ArrowError::CastError(format!(
                     "Unsupported data type {}, {}",
                     left.data_type(), right.data_type()
                 )))
             )
         }
     }
 }
+
+/// Perform `left * right` operation on two arrays. If either left or right value is null
+/// then the result is also null.
+///
+/// This detects overflow and returns an `Err` for that. For a non-overflow-checking variant,
+/// use `multiply_dyn` instead.
+pub fn multiply_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
+    match left.data_type() {
+        DataType::Dictionary(_, _) => {
+            typed_dict_math_op!(
+                left,
+                right,
+                |a, b| a.mul_checked(b),
+                math_checked_op_dict
+            )
+        }
+        _ => {
+            downcast_primitive_array!(
+                (left, right) => {
+                    math_checked_op(left, right, |a, b| a.mul_checked(b)).map(|a| Arc::new(a) as ArrayRef)
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Unsupported data type {}, {}",
+                    left.data_type(), right.data_type()
+                )))
+            )
+        }
+    }
+}
@@ -1134,7 +1317,52 @@ where

 /// Perform `left / right` operation on two arrays. If either left or right value is null
 /// then the result is also null. If any right hand value is zero then the result of this
 /// operation will be `Err(ArrowError::DivideByZero)`.
+///
+/// This doesn't detect overflow. If the operation overflows, the result wraps around.
+/// For an overflow-checking variant, use `divide_dyn_checked` instead.
 pub fn divide_dyn(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
+    match left.data_type() {
+        DataType::Dictionary(_, _) => {
+            typed_dict_math_op!(
+                left,
+                right,
+                |a, b| {
+                    if b.is_zero() {
+                        Err(ArrowError::DivideByZero)
+                    } else {
+                        Ok(a.div_wrapping(b))
+                    }
+                },
+                math_divide_checked_op_dict
+            )
+        }
+        _ => {
+            downcast_primitive_array!(
+                (left, right) => {
+                    math_checked_divide_op(left, right, |a, b| {
+                        if b.is_zero() {
+                            Err(ArrowError::DivideByZero)
+                        } else {
+                            Ok(a.div_wrapping(b))
+                        }
+                    }).map(|a| Arc::new(a) as ArrayRef)
+                }
+                _ => Err(ArrowError::CastError(format!(
+                    "Unsupported data type {}, {}",
+                    left.data_type(), right.data_type()
+                )))
+            )
+        }
+    }
+}
+
+/// Perform `left / right` operation on two arrays. If either left or right value is null
+/// then the result is also null. If any right hand value is zero then the result of this
+/// operation will be `Err(ArrowError::DivideByZero)`.
+///
+/// This detects overflow and returns an `Err` for that. For a non-overflow-checking variant,
+/// use `divide_dyn` instead.
+pub fn divide_dyn_checked(left: &dyn Array, right: &dyn Array) -> Result<ArrayRef> {
     match left.data_type() {
         DataType::Dictionary(_, _) => {
             typed_dict_math_op!(
@@ -2357,4 +2585,140 @@ mod tests {
         let expected = Int32Array::from(vec![None]);
         assert_eq!(expected, overflow.unwrap());
     }
+
+    #[test]
+    fn test_primitive_add_dyn_wrapping_overflow() {
+        let a = Int32Array::from(vec![i32::MAX, i32::MIN]);
+        let b = Int32Array::from(vec![1, 1]);
+
+        let wrapped = add_dyn(&a, &b).unwrap();
+        let expected =
+            Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = add_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_dictionary_add_dyn_wrapping_overflow() {
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(2, 2);
+        builder.append(i32::MAX).unwrap();
+        builder.append(i32::MIN).unwrap();
+        let a = builder.finish();
+
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(2, 2);
+        builder.append(1).unwrap();
+        builder.append(1).unwrap();
+        let b = builder.finish();
+
+        let wrapped = add_dyn(&a, &b).unwrap();
+        let expected =
+            Arc::new(Int32Array::from(vec![-2147483648, -2147483647])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = add_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_primitive_subtract_dyn_wrapping_overflow() {
+        let a = Int32Array::from(vec![-2]);
+        let b = Int32Array::from(vec![i32::MAX]);
+
+        let wrapped = subtract_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = subtract_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_dictionary_subtract_dyn_wrapping_overflow() {
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(-2).unwrap();
+        let a = builder.finish();
+
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(i32::MAX).unwrap();
+        let b = builder.finish();
+
+        let wrapped = subtract_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![i32::MAX])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = subtract_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_primitive_mul_dyn_wrapping_overflow() {
+        let a = Int32Array::from(vec![10]);
+        let b = Int32Array::from(vec![i32::MAX]);
+
+        let wrapped = multiply_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = multiply_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_dictionary_mul_dyn_wrapping_overflow() {
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(10).unwrap();
+        let a = builder.finish();
+
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(i32::MAX).unwrap();
+        let b = builder.finish();
+
+        let wrapped = multiply_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![-10])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = multiply_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_primitive_div_dyn_wrapping_overflow() {
+        let a = Int32Array::from(vec![i32::MIN]);
+        let b = Int32Array::from(vec![-1]);
+
+        let wrapped = divide_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = divide_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
+
+    #[test]
+    fn test_dictionary_div_dyn_wrapping_overflow() {
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(i32::MIN).unwrap();
+        let a = builder.finish();
+
+        let mut builder =
+            PrimitiveDictionaryBuilder::<Int8Type, Int32Type>::with_capacity(1, 1);
+        builder.append(-1).unwrap();
+        let b = builder.finish();
+
+        let wrapped = divide_dyn(&a, &b).unwrap();
+        let expected = Arc::new(Int32Array::from(vec![-2147483648])) as ArrayRef;
+        assert_eq!(&expected, &wrapped);
+
+        let overflow = divide_dyn_checked(&a, &b);
+        overflow.expect_err("overflow should be detected");
+    }
 }

diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs
index 2347502f96e..bf10289683f 100644
--- a/arrow/src/compute/kernels/arity.rs
+++ b/arrow/src/compute/kernels/arity.rs
@@ -18,7 +18,8 @@

 //! Defines kernels suitable to perform operations to primitive arrays.
 use crate::array::{
-    Array, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray, PrimitiveArray,
+    Array, ArrayAccessor, ArrayData, ArrayIter, ArrayRef, BufferBuilder, DictionaryArray,
+    PrimitiveArray,
 };
 use crate::buffer::Buffer;
 use crate::compute::util::combine_option_bitmap;
@@ -26,6 +27,7 @@ use crate::datatypes::{ArrowNumericType, ArrowPrimitiveType};
 use crate::downcast_dictionary_array;
 use crate::error::{ArrowError, Result};
 use crate::util::bit_iterator::try_for_each_valid_idx;
+use arrow_buffer::MutableBuffer;
 use std::sync::Arc;

 #[inline]
@@ -287,16 +289,14 @@ where
 ///
 /// Return an error if the arrays have different lengths or
 /// the operation fails
-pub fn try_binary<A, B, F, O>(
-    a: &PrimitiveArray<A>,
-    b: &PrimitiveArray<B>,
+pub fn try_binary<A: ArrayAccessor, B: ArrayAccessor, F, O>(
+    a: A,
+    b: B,
     op: F,
 ) -> Result<PrimitiveArray<O>>
 where
-    A: ArrowPrimitiveType,
-    B: ArrowPrimitiveType,
     O: ArrowPrimitiveType,
-    F: Fn(A::Native, B::Native) -> Result<O::Native>,
+    F: Fn(A::Item, B::Item) -> Result<O::Native>,
 {
     if a.len() != b.len() {
         return Err(ArrowError::ComputeError(
@@ -309,36 +309,52 @@ where
     let len = a.len();

     if a.null_count() == 0 && b.null_count() == 0 {
-        let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
-        let buffer = unsafe { Buffer::try_from_trusted_len_iter(values) }?;
-        // JUSTIFICATION
-        //  Benefit
-        //      ~75% speedup
-        //  Soundness
-        //      `values` is an iterator with a known size from a PrimitiveArray
-        return Ok(unsafe { build_primitive_array(len, buffer, 0, None) });
+        try_binary_no_nulls(len, a, b, op)
+    } else {
+        let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap();
+
+        let null_count = null_buffer
+            .as_ref()
+            .map(|x| len - x.count_set_bits())
+            .unwrap_or_default();
+
+        let mut buffer = BufferBuilder::<O::Native>::new(len);
+        buffer.append_n_zeroed(len);
+        let slice = buffer.as_slice_mut();
+
+        try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| {
+            unsafe {
+                *slice.get_unchecked_mut(idx) =
+                    op(a.value_unchecked(idx), b.value_unchecked(idx))?
+            };
+            Ok::<_, ArrowError>(())
+        })?;
+
+        Ok(unsafe {
+            build_primitive_array(len, buffer.finish(), null_count, null_buffer)
+        })
     }
+}

-    let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap();
-
-    let null_count = null_buffer
-        .as_ref()
-        .map(|x| len - x.count_set_bits())
-        .unwrap_or_default();
-
-    let mut buffer = BufferBuilder::<O::Native>::new(len);
-    buffer.append_n_zeroed(len);
-    let slice = buffer.as_slice_mut();
-
-    try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| {
+/// This intentional inline(never) attribute helps LLVM optimize the loop.
+#[inline(never)]
+fn try_binary_no_nulls<A: ArrayAccessor, B: ArrayAccessor, F, O>(
+    len: usize,
+    a: A,
+    b: B,
+    op: F,
+) -> Result<PrimitiveArray<O>>
+where
+    O: ArrowPrimitiveType,
+    F: Fn(A::Item, B::Item) -> Result<O::Native>,
+{
+    let mut buffer = MutableBuffer::new(len * O::get_byte_width());
+    for idx in 0..len {
         unsafe {
-            *slice.get_unchecked_mut(idx) =
-                op(a.value_unchecked(idx), b.value_unchecked(idx))?
+            buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?);
         };
-        Ok::<_, ArrowError>(())
-    })?;
-
-    Ok(unsafe { build_primitive_array(len, buffer.finish(), null_count, null_buffer) })
+    }
+    Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) })
 }

 /// Applies the provided binary operation across `a` and `b`, collecting the optional results

From 5b601b3065d1c239feef6badf3ff68b6d72916a3 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 20 Sep 2022 06:30:55 -0400
Subject: [PATCH 16/16] MINOR: tweak arrow release instructions (#2758)

---
 dev/release/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dev/release/README.md b/dev/release/README.md
index 48748eccbe8..d418a09d070 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -74,9 +74,12 @@ git checkout -b make-release
 # manually edit ./dev/release/update_change_log.sh to reflect the release version
 # create the changelog
 CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log.sh
+# run automated script to copy labels to issues based on referenced PRs
+python dev/release/label_issues.py
 # review change log / edit issues and labels if needed, rerun
 git commit -a -m 'Create changelog'
+
 # update versions
 sed -i '' -e 's/14.0.0/23.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md`
 git commit -a -m 'Update version'
@@ -228,6 +231,7 @@ following commands

 Rust Arrow Crates:

 ```shell
+(cd arrow-buffer && cargo publish)
 (cd arrow && cargo publish)
 (cd arrow-flight && cargo publish)
 (cd parquet && cargo publish)
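For reviewers trying out [PATCH 15/16], a minimal usage sketch of the new overflow-checking kernels, mirroring the `test_primitive_add_dyn_wrapping_overflow` test in that patch. The kernels are defined in `arrow/src/compute/kernels/arithmetic.rs`; the `use arrow::compute::{...}` re-export path is an assumption (the existing arithmetic kernels are re-exported there), and it assumes a build of the `arrow` crate that includes these patches:

```rust
use arrow::array::{ArrayRef, Int32Array};
// Assumed re-export path; the kernels live in compute::kernels::arithmetic.
use arrow::compute::{add_dyn, add_dyn_checked};

fn main() {
    let a = Int32Array::from(vec![i32::MAX, i32::MIN]);
    let b = Int32Array::from(vec![1, 1]);

    // Wrapping variant: i32::MAX + 1 silently wraps around to i32::MIN.
    let wrapped: ArrayRef = add_dyn(&a, &b).unwrap();
    println!("wrapped result: {:?}", wrapped);

    // Checked variant: the same inputs surface the overflow as an Err instead.
    assert!(add_dyn_checked(&a, &b).is_err());
}
```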