Speed up Decimal256 validation based on bytes comparison and add be…

…nchmark test (#2360)
apache · Aug 12, 2022 · b235173 · b235173
1 parent e7dcfbc
commit b235173
Show file tree

Hide file tree

Showing 8 changed files with 789 additions and 159 deletions.
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
@@ -209,3 +209,7 @@ required-features = ["test_utils"]
 [[bench]]
 name = "array_data_validate"
 harness = false
+
+[[bench]]
+name = "decimal_validate"
+harness = false
diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+
+use arrow::array::{Array, Decimal128Array, Decimal256Array, Decimal256Builder};
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::util::decimal::Decimal256;
+
+fn validate_decimal128_array(array: Decimal128Array) {
+    array.with_precision_and_scale(35, 0).unwrap();
+}
+
+fn validate_decimal256_array(array: Decimal256Array) {
+    array.with_precision_and_scale(35, 0).unwrap();
+}
+
+fn validate_decimal128_benchmark(c: &mut Criterion) {
+    let decimal_array = Decimal128Array::from_iter_values(vec![12324; 20000]);
+    let data = decimal_array.into_data();
+    c.bench_function("validate_decimal128_array 20000", |b| {
+        b.iter(|| {
+            let array = Decimal128Array::from(data.clone());
+            validate_decimal128_array(array);
+        })
+    });
+}
+
+fn validate_decimal256_benchmark(c: &mut Criterion) {
+    let mut decimal_builder = Decimal256Builder::new(20000, 76, 0);
+    let mut bytes = vec![0; 32];
+    bytes[0..16].clone_from_slice(&12324_i128.to_le_bytes());
+    for _ in 0..20000 {
+        decimal_builder
+            .append_value(&Decimal256::new(76, 0, &bytes))
+            .unwrap();
+    }
+    let decimal_array256_data = decimal_builder.finish();
+    let data = decimal_array256_data.into_data();
+    c.bench_function("validate_decimal256_array 20000", |b| {
+        b.iter(|| {
+            let array = Decimal256Array::from(data.clone());
+            validate_decimal256_array(array);
+        })
+    });
+}
+
+criterion_group!(
+    benches,
+    validate_decimal128_benchmark,
+    validate_decimal256_benchmark,
+);
+criterion_main!(benches);
diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs
@@ -29,10 +29,9 @@ use super::{BasicDecimalIter, BooleanBufferBuilder, FixedSizeBinaryArray};
 #[allow(deprecated)]
 pub use crate::array::DecimalIter;
 use crate::buffer::{Buffer, MutableBuffer};
-use crate::datatypes::DataType;
+use crate::datatypes::{validate_decimal256_precision_with_lt_bytes, DataType};
 use crate::datatypes::{
-    validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION,
-    DECIMAL_DEFAULT_SCALE,
+    validate_decimal_precision, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE,
 };
 use crate::error::{ArrowError, Result};
 use crate::util::decimal::{BasicDecimal, Decimal256};
@@ -271,13 +270,11 @@ impl Decimal128Array {
         Decimal128Array::from(data)
     }
 
-    /// Validates decimal values in this array can be properly interpreted
-    /// with the specified precision.
-    pub fn validate_decimal_precision(&self, precision: usize) -> Result<()> {
-        if precision < self.precision {
-            for v in self.iter().flatten() {
-                validate_decimal_precision(v.as_i128(), precision)?;
-            }
+    // Validates decimal values in this array can be properly interpreted
+    // with the specified precision.
+    fn validate_decimal_precision(&self, precision: usize) -> Result<()> {
+        for v in self.iter().flatten() {
+            validate_decimal_precision(v.as_i128(), precision)?;
         }
         Ok(())
     }
@@ -317,7 +314,9 @@ impl Decimal128Array {
         // Ensure that all values are within the requested
         // precision. For performance, only check if the precision is
         // decreased
-        self.validate_decimal_precision(precision)?;
+        if precision < self.precision {
+            self.validate_decimal_precision(precision)?;
+        }
 
         let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale);
         assert_eq!(self.data().data_type(), &data_type);
@@ -330,15 +329,23 @@ impl Decimal128Array {
 }
 
 impl Decimal256Array {
-    /// Validates decimal values in this array can be properly interpreted
-    /// with the specified precision.
-    pub fn validate_decimal_precision(&self, precision: usize) -> Result<()> {
-        if precision < self.precision {
-            for v in self.iter().flatten() {
-                validate_decimal256_precision(&v.to_big_int(), precision)?;
+    // Validates decimal values in this array can be properly interpreted
+    // with the specified precision.
+    fn validate_decimal_precision(&self, precision: usize) -> Result<()> {
+        (0..self.len()).try_for_each(|idx| {
+            if self.is_valid(idx) {
+                let raw_val = unsafe {
+                    let pos = self.value_offset(idx);
+                    std::slice::from_raw_parts(
+                        self.raw_value_data_ptr().offset(pos as isize),
+                        Self::VALUE_LENGTH as usize,
+                    )
+                };
+                validate_decimal256_precision_with_lt_bytes(raw_val, precision)
+            } else {
+                Ok(())
             }
-        }
-        Ok(())
+        })
     }
 
     /// Returns a Decimal array with the same data as self, with the
@@ -376,7 +383,9 @@ impl Decimal256Array {
         // Ensure that all values are within the requested
         // precision. For performance, only check if the precision is
         // decreased
-        self.validate_decimal_precision(precision)?;
+        if precision < self.precision {
+            self.validate_decimal_precision(precision)?;
+        }
 
         let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale);
         assert_eq!(self.data().data_type(), &data_type);

diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use num::BigInt;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -26,7 +25,9 @@ use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder};
 
 use crate::error::{ArrowError, Result};
 
-use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision};
+use crate::datatypes::{
+    validate_decimal256_precision_with_lt_bytes, validate_decimal_precision,
+};
 use crate::util::decimal::Decimal256;
 
 /// Array Builder for [`Decimal128Array`]
@@ -201,8 +202,7 @@ impl Decimal256Builder {
     pub fn append_value(&mut self, value: &Decimal256) -> Result<()> {
         let value = if self.value_validation {
             let raw_bytes = value.raw_value();
-            let integer = BigInt::from_signed_bytes_le(raw_bytes);
-            validate_decimal256_precision(&integer, self.precision)?;
+            validate_decimal256_precision_with_lt_bytes(raw_bytes, self.precision)?;
             value
         } else {
             value
@@ -255,7 +255,7 @@ impl Decimal256Builder {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use num::Num;
+    use num::{BigInt, Num};
 
     use crate::array::array_decimal::Decimal128Array;
     use crate::array::{array_decimal, Array};

diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
@@ -19,8 +19,8 @@
 //! common attributes and operations for Arrow array.
 
 use crate::datatypes::{
-    validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit,
-    UnionMode,
+    validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DataType,
+    IntervalUnit, UnionMode,
 };
 use crate::error::{ArrowError, Result};
 use crate::util::bit_iterator::BitSliceIterator;
@@ -30,7 +30,6 @@ use crate::{
     util::bit_util,
 };
 use half::f16;
-use num::BigInt;
 use std::convert::TryInto;
 use std::mem;
 use std::ops::Range;
@@ -1049,8 +1048,7 @@ impl ArrayData {
                 for pos in 0..self.len() {
                     let offset = pos * 32;
                     let raw_bytes = &values[offset..offset + 32];
-                    let integer = BigInt::from_signed_bytes_le(raw_bytes);
-                    validate_decimal256_precision(&integer, *p)?;
+                    validate_decimal256_precision_with_lt_bytes(raw_bytes, *p)?;
                 }
                 Ok(())
             }