From eadab36c1dedffcf6a9c85677ae942f81b7f842c Mon Sep 17 00:00:00 2001 From: psvri Date: Fri, 19 Aug 2022 07:52:22 +0000 Subject: [PATCH 1/4] Refactor decimal builder --- arrow/src/array/array_decimal.rs | 4 ++-- arrow/src/array/builder/decimal_builder.rs | 11 ++++++++--- arrow/src/array/builder/struct_builder.rs | 6 +++--- arrow/src/array/data.rs | 2 +- arrow/src/csv/reader.rs | 2 +- arrow/src/util/integration_util.rs | 2 +- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 540024a923c..12bb13a1ae4 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -571,7 +571,7 @@ mod tests { #[test] #[cfg(not(feature = "force_validate"))] fn test_decimal_append_error_value() { - let mut decimal_builder = Decimal128Builder::new(10, 5, 3); + let mut decimal_builder = Decimal128Builder::with_capacity(10, 5, 3); let mut result = decimal_builder.append_value(123456); let mut error = result.unwrap_err(); assert_eq!( @@ -588,7 +588,7 @@ mod tests { let arr = decimal_builder.finish(); assert_eq!("12.345", arr.value_as_string(1)); - decimal_builder = Decimal128Builder::new(10, 2, 1); + decimal_builder = Decimal128Builder::new(2, 1); result = decimal_builder.append_value(100); error = result.unwrap_err(); assert_eq!( diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index 7021dce372e..54754045718 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -61,9 +61,14 @@ pub struct Decimal256Builder { impl Decimal128Builder { const BYTE_LENGTH: i32 = 16; + /// Creates a new [`Decimal128Builder`] + pub fn new(precision: usize, scale: usize) -> Self { + Self::with_capacity(1024, precision, scale) + } + /// Creates a new [`Decimal128Builder`], `capacity` is the number of bytes in the values /// array - pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { + pub fn with_capacity(capacity: usize, precision: usize, scale: usize) -> Self { Self { builder: FixedSizeBinaryBuilder::new(capacity, Self::BYTE_LENGTH), precision, @@ -245,7 +250,7 @@ mod tests { #[test] fn test_decimal_builder() { - let mut builder = Decimal128Builder::new(30, 38, 6); + let mut builder = Decimal128Builder::new(38, 6); builder.append_value(8_887_000_000_i128).unwrap(); builder.append_null(); @@ -263,7 +268,7 @@ mod tests { #[test] fn test_decimal_builder_with_decimal128() { - let mut builder = Decimal128Builder::new(30, 38, 6); + let mut builder = Decimal128Builder::new(38, 6); builder .append_value(Decimal128::new_from_i128(30, 38, 8_887_000_000_i128)) diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow/src/array/builder/struct_builder.rs index 01a792b5d19..507273dc82a 100644 --- a/arrow/src/array/builder/struct_builder.rs +++ b/arrow/src/array/builder/struct_builder.rs @@ -111,9 +111,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) } - DataType::Decimal128(precision, scale) => { - Box::new(Decimal128Builder::new(capacity, *precision, *scale)) - } + DataType::Decimal128(precision, scale) => Box::new( + Decimal128Builder::with_capacity(capacity, *precision, *scale), + ), DataType::Utf8 => Box::new(StringBuilder::new(capacity)), DataType::Date32 => Box::new(Date32Builder::new(capacity)), DataType::Date64 => Box::new(Date64Builder::new(capacity)), diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 3993d51d9b8..1c23c4278ea 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -2847,7 +2847,7 @@ mod tests { #[test] fn test_decimal_validation() { - let mut builder = Decimal128Builder::new(4, 10, 4); + let mut builder = Decimal128Builder::with_capacity(4, 10, 4); builder.append_value(10000).unwrap(); builder.append_value(20000).unwrap(); let array = builder.finish(); diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index ac26b377ffe..b90d92dae6e 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -699,7 +699,7 @@ fn build_decimal_array( precision: usize, scale: usize, ) -> Result { - let mut decimal_builder = Decimal128Builder::new(rows.len(), precision, scale); + let mut decimal_builder = Decimal128Builder::with_capacity(rows.len(), precision, scale); for row in rows { let col_s = row.get(col_idx); match col_s { diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index ee5c947a2ff..f241b745649 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -776,7 +776,7 @@ pub fn array_from_json( } } DataType::Decimal128(precision, scale) => { - let mut b = Decimal128Builder::new(json_col.count, *precision, *scale); + let mut b = Decimal128Builder::with_capacity(json_col.count, *precision, *scale); // C++ interop tests involve incompatible decimal values unsafe { b.disable_value_validation(); From 38cb03e406e8b988653e08adbe43c891ee185106 Mon Sep 17 00:00:00 2001 From: psvri Date: Fri, 19 Aug 2022 08:09:16 +0000 Subject: [PATCH 2/4] Fixing errors --- arrow/benches/builder.rs | 2 +- arrow/benches/cast_kernels.rs | 2 +- arrow/benches/decimal_validate.rs | 2 +- arrow/src/csv/reader.rs | 3 ++- arrow/src/util/integration_util.rs | 3 ++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arrow/benches/builder.rs b/arrow/benches/builder.rs index 691cd068312..b8b31351c7a 100644 --- a/arrow/benches/builder.rs +++ b/arrow/benches/builder.rs @@ -112,7 +112,7 @@ fn bench_decimal128(c: &mut Criterion) { c.bench_function("bench_decimal128_builder", |b| { b.iter(|| { let mut rng = rand::thread_rng(); - let mut decimal_builder = Decimal128Builder::new(BATCH_SIZE, 38, 0); + let mut decimal_builder = Decimal128Builder::with_capacity(BATCH_SIZE, 38, 0); for _ in 0..BATCH_SIZE { decimal_builder .append_value(rng.gen_range::(0..9999999999)) diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index a7bae35e712..93b0c8a961a 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -84,7 +84,7 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { fn build_decimal128_array(size: usize, precision: usize, scale: usize) -> ArrayRef { let mut rng = seedable_rng(); - let mut builder = Decimal128Builder::new(size, precision, scale); + let mut builder = Decimal128Builder::with_capacity(size, precision, scale); for _ in 0..size { let _ = builder.append_value(rng.gen_range::(0..1000000000)); diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs index 1c9406abc78..8aae9cf3464 100644 --- a/arrow/benches/decimal_validate.rs +++ b/arrow/benches/decimal_validate.rs @@ -40,7 +40,7 @@ fn validate_decimal256_array(array: Decimal256Array) { fn validate_decimal128_benchmark(c: &mut Criterion) { let mut rng = rand::thread_rng(); let size: i128 = 20000; - let mut decimal_builder = Decimal128Builder::new(size as usize, 38, 0); + let mut decimal_builder = Decimal128Builder::with_capacity(size as usize, 38, 0); for _ in 0..size { decimal_builder .append_value(rng.gen_range::(0..999999999999)) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index b90d92dae6e..5917f6797dc 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -699,7 +699,8 @@ fn build_decimal_array( precision: usize, scale: usize, ) -> Result { - let mut decimal_builder = Decimal128Builder::with_capacity(rows.len(), precision, scale); + let mut decimal_builder = + Decimal128Builder::with_capacity(rows.len(), precision, scale); for row in rows { let col_s = row.get(col_idx); match col_s { diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index f241b745649..be490eebf49 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -776,7 +776,8 @@ pub fn array_from_json( } } DataType::Decimal128(precision, scale) => { - let mut b = Decimal128Builder::with_capacity(json_col.count, *precision, *scale); + let mut b = + Decimal128Builder::with_capacity(json_col.count, *precision, *scale); // C++ interop tests involve incompatible decimal values unsafe { b.disable_value_validation(); From d577878d9a3293c83c422a4b4082d5feb405e137 Mon Sep 17 00:00:00 2001 From: psvri Date: Fri, 19 Aug 2022 15:10:30 +0000 Subject: [PATCH 3/4] refactor decimal256builder constructor --- arrow/benches/builder.rs | 3 ++- arrow/benches/cast_kernels.rs | 2 +- arrow/benches/decimal_validate.rs | 2 +- arrow/src/array/array_decimal.rs | 2 +- arrow/src/array/builder/decimal_builder.rs | 15 ++++++++++----- arrow/src/util/integration_util.rs | 3 ++- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/arrow/benches/builder.rs b/arrow/benches/builder.rs index b8b31351c7a..6984b607d76 100644 --- a/arrow/benches/builder.rs +++ b/arrow/benches/builder.rs @@ -127,7 +127,8 @@ fn bench_decimal256(c: &mut Criterion) { c.bench_function("bench_decimal128_builder", |b| { b.iter(|| { let mut rng = rand::thread_rng(); - let mut decimal_builder = Decimal256Builder::new(BATCH_SIZE, 76, 10); + let mut decimal_builder = + Decimal256Builder::with_capacity(BATCH_SIZE, 76, 10); for _ in 0..BATCH_SIZE { decimal_builder .append_value(&Decimal256::from(BigInt::from( diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 93b0c8a961a..4e52e76842c 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -94,7 +94,7 @@ fn build_decimal128_array(size: usize, precision: usize, scale: usize) -> ArrayR fn build_decimal256_array(size: usize, precision: usize, scale: usize) -> ArrayRef { let mut rng = seedable_rng(); - let mut builder = Decimal256Builder::new(size, precision, scale); + let mut builder = Decimal256Builder::with_capacity(size, precision, scale); let mut bytes = [0; 32]; for _ in 0..size { let num = rng.gen_range::(0..1000000000); diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs index 8aae9cf3464..555373e4a63 100644 --- a/arrow/benches/decimal_validate.rs +++ b/arrow/benches/decimal_validate.rs @@ -59,7 +59,7 @@ fn validate_decimal128_benchmark(c: &mut Criterion) { fn validate_decimal256_benchmark(c: &mut Criterion) { let mut rng = rand::thread_rng(); let size: i128 = 20000; - let mut decimal_builder = Decimal256Builder::new(size as usize, 76, 0); + let mut decimal_builder = Decimal256Builder::with_capacity(size as usize, 76, 0); for _ in 0..size { let v = rng.gen_range::(0..999999999999999); let decimal = Decimal256::from_big_int(&BigInt::from(v), 76, 0).unwrap(); diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 12bb13a1ae4..aa25e66a90c 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -884,7 +884,7 @@ mod tests { #[test] fn test_decimal256_iter() { - let mut builder = Decimal256Builder::new(30, 76, 6); + let mut builder = Decimal256Builder::with_capacity(30, 76, 6); let value = BigInt::from_str_radix("12345", 10).unwrap(); let decimal1 = Decimal256::from_big_int(&value, 76, 6).unwrap(); builder.append_value(&decimal1).unwrap(); diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index 54754045718..880537e43cd 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -160,9 +160,14 @@ impl ArrayBuilder for Decimal128Builder { impl Decimal256Builder { const BYTE_LENGTH: i32 = 32; + /// Creates a new [`Decimal256Builder`] + pub fn new(precision: usize, scale: usize) -> Self { + Self::with_capacity(1024, precision, scale) + } + /// Creates a new [`Decimal256Builder`], `capacity` is the number of bytes in the values /// array - pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { + pub fn with_capacity(capacity: usize, precision: usize, scale: usize) -> Self { Self { builder: FixedSizeBinaryBuilder::new(capacity, Self::BYTE_LENGTH), precision, @@ -288,7 +293,7 @@ mod tests { #[test] fn test_decimal256_builder() { - let mut builder = Decimal256Builder::new(30, 40, 6); + let mut builder = Decimal256Builder::new(40, 6); let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); @@ -332,7 +337,7 @@ mod tests { expected = "Decimal value does not have the same precision or scale as Decimal256Builder" )] fn test_decimal256_builder_unmatched_precision_scale() { - let mut builder = Decimal256Builder::new(30, 10, 6); + let mut builder = Decimal256Builder::with_capacity(30, 10, 6); let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); @@ -345,7 +350,7 @@ mod tests { expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" )] fn test_decimal256_builder_out_of_range_precision_scale() { - let mut builder = Decimal256Builder::new(30, 75, 6); + let mut builder = Decimal256Builder::new(75, 6); let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); let value = Decimal256::from_big_int(&big_value, 75, 6).unwrap(); @@ -357,7 +362,7 @@ mod tests { expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" )] fn test_decimal256_data_validation() { - let mut builder = Decimal256Builder::new(30, 75, 6); + let mut builder = Decimal256Builder::new(75, 6); // Disable validation at builder unsafe { builder.disable_value_validation(); diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index be490eebf49..7fa3f082c67 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -799,7 +799,8 @@ pub fn array_from_json( Ok(Arc::new(b.finish())) } DataType::Decimal256(precision, scale) => { - let mut b = Decimal256Builder::new(json_col.count, *precision, *scale); + let mut b = + Decimal256Builder::with_capacity(json_col.count, *precision, *scale); // C++ interop tests involve incompatible decimal values unsafe { b.disable_value_validation(); From 82c7e754997154dbe18d326e24ac68e5358b4e44 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Sat, 20 Aug 2022 15:39:58 +0100 Subject: [PATCH 4/4] Capacity in terms of elements --- arrow/src/array/builder/decimal_builder.rs | 8 ++++---- .../array/builder/fixed_size_binary_builder.rs | 16 ++++++---------- arrow/src/array/equal/mod.rs | 2 +- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index 4523bd3bb98..b4c877c7110 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -66,8 +66,8 @@ impl Decimal128Builder { Self::with_capacity(1024, precision, scale) } - /// Creates a new [`Decimal128Builder`], `capacity` is the number of bytes in the values - /// array + /// Creates a new [`Decimal128Builder`], `capacity` is the number of decimal values + /// that can be appended without reallocating pub fn with_capacity(capacity: usize, precision: usize, scale: usize) -> Self { Self { builder: FixedSizeBinaryBuilder::with_capacity(capacity, Self::BYTE_LENGTH), @@ -165,8 +165,8 @@ impl Decimal256Builder { Self::with_capacity(1024, precision, scale) } - /// Creates a new [`Decimal256Builder`], `capacity` is the number of bytes in the values - /// array + /// Creates a new [`Decimal256Builder`], `capacity` is the number of decimal values + /// that can be appended without reallocating pub fn with_capacity(capacity: usize, precision: usize, scale: usize) -> Self { Self { builder: FixedSizeBinaryBuilder::with_capacity(capacity, Self::BYTE_LENGTH), diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs b/arrow/src/array/builder/fixed_size_binary_builder.rs index f0483ccfb7d..30c25e0a62b 100644 --- a/arrow/src/array/builder/fixed_size_binary_builder.rs +++ b/arrow/src/array/builder/fixed_size_binary_builder.rs @@ -38,8 +38,8 @@ impl FixedSizeBinaryBuilder { Self::with_capacity(1024, byte_width) } - /// Creates a new [`FixedSizeBinaryBuilder`], `capacity` is the number of bytes in the values - /// buffer + /// Creates a new [`FixedSizeBinaryBuilder`], `capacity` is the number of byte slices + /// that can be appended without reallocating pub fn with_capacity(capacity: usize, byte_width: i32) -> Self { assert!( byte_width >= 0, @@ -47,12 +47,8 @@ impl FixedSizeBinaryBuilder { byte_width ); Self { - values_builder: UInt8BufferBuilder::new(capacity), - null_buffer_builder: NullBufferBuilder::new(if byte_width > 0 { - capacity / byte_width as usize - } else { - 0 - }), + values_builder: UInt8BufferBuilder::new(capacity * byte_width as usize), + null_buffer_builder: NullBufferBuilder::new(capacity), value_length: byte_width, } } @@ -137,7 +133,7 @@ mod tests { #[test] fn test_fixed_size_binary_builder() { - let mut builder = FixedSizeBinaryBuilder::with_capacity(15, 5); + let mut builder = FixedSizeBinaryBuilder::with_capacity(3, 5); // [b"hello", null, "arrow"] builder.append_value(b"hello").unwrap(); @@ -176,7 +172,7 @@ mod tests { expected = "Byte slice does not have the same length as FixedSizeBinaryBuilder value lengths" )] fn test_fixed_size_binary_builder_with_inconsistent_value_length() { - let mut builder = FixedSizeBinaryBuilder::with_capacity(15, 4); + let mut builder = FixedSizeBinaryBuilder::with_capacity(1, 4); builder.append_value(b"hello").unwrap(); } #[test] diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index dd32574bbc6..2984dad8972 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -768,7 +768,7 @@ mod tests { fn create_fixed_size_binary_array, T: AsRef<[Option]>>( data: T, ) -> ArrayData { - let mut builder = FixedSizeBinaryBuilder::with_capacity(15, 5); + let mut builder = FixedSizeBinaryBuilder::with_capacity(data.as_ref().len(), 5); for d in data.as_ref() { if let Some(v) = d {