diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs
index ecf1850ad45..f5b25acb175 100644
--- a/arrow/src/array/array_binary.rs
+++ b/arrow/src/array/array_binary.rs
@@ -1375,4 +1375,34 @@ mod tests {
         assert_eq!(2, arr.value_length());
         assert_eq!(5, arr.len())
     }
+
+    #[test]
+    fn test_binary_array_all_null() {
+        let data = vec![None];
+        let array = BinaryArray::from(data);
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
+    }
+
+    #[test]
+    fn test_large_binary_array_all_null() {
+        let data = vec![None];
+        let array = LargeBinaryArray::from(data);
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
+    }
+
+    #[test]
+    fn test_fixed_size_binary_array_all_null() {
+        let data = vec![None] as Vec<Option<&[u8]>>;
+        let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap();
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
+    }
 }
diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs
index c684c253aa7..75d4f3ab3fa 100644
--- a/arrow/src/array/array_dictionary.rs
+++ b/arrow/src/array/array_dictionary.rs
@@ -412,4 +412,14 @@ mod tests {
         assert_eq!(1, keys.value(2));
         assert_eq!(0, keys.value(5));
     }
+
+    #[test]
+    fn test_dictionary_all_nulls() {
+        let test = vec![None, None, None];
+        let array: DictionaryArray<Int32Type> = test.into_iter().collect();
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
+    }
 }
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 0d8dcf1a68e..82c9f16fe99 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -554,13 +554,22 @@ mod tests {
     }
 
     #[test]
-    fn test_string_array_from_string_vec() {
-        let data = vec!["Foo".to_owned(), "Bar".to_owned(), "Baz".to_owned()];
+    fn test_string_array_all_null() {
+        let data = vec![None];
         let array = StringArray::from(data);
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
+    }
 
-        assert_eq!(array.len(), 3);
-        assert_eq!(array.value(0), "Foo");
-        assert_eq!(array.value(1), "Bar");
-        assert_eq!(array.value(2), "Baz");
+    #[test]
+    fn test_large_string_array_all_null() {
+        let data = vec![None];
+        let array = LargeStringArray::from(data);
+        array
+            .data()
+            .validate_full()
+            .expect("All null array has valid array data");
     }
 }
diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs
index 40a8beebf51..728483795de 100644
--- a/arrow/src/array/data.rs
+++ b/arrow/src/array/data.rs
@@ -20,6 +20,7 @@
 use std::convert::TryInto;
 use std::mem;
+use std::ops::Range;
 use std::sync::Arc;
 
 use crate::datatypes::{DataType, IntervalUnit};
@@ -319,7 +320,7 @@ impl ArrayData {
         buffers: Vec<Buffer>,
         child_data: Vec<ArrayData>,
     ) -> Result<Self> {
-        // Safetly justification: `validate` is (will be) called below
+        // Safety justification: `validate_full` is called below
         let new_self = unsafe {
             Self::new_unchecked(
                 data_type,
@@ -332,7 +333,8 @@ impl ArrayData {
             )
         };
 
-        new_self.validate()?;
+        // As the data is not trusted, do a full validation of its contents
+        new_self.validate_full()?;
 
         Ok(new_self)
     }
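To illustrate the behavior change above: a minimal sketch, not part of this diff (the test name and byte values are invented), of an input the old `validate()`-only `try_new` accepted but the new `validate_full()` path rejects:

#[test]
fn try_new_rejects_invalid_utf8_sketch() {
    // 0xFF can never appear in well-formed UTF-8. The buffer sizes are
    // structurally consistent, so the cheap checks pass; only the full
    // content validation inspects the bytes themselves.
    let values = Buffer::from_slice_ref(&[0xFFu8, 0x00]);
    let offsets = Buffer::from_slice_ref(&[0i32, 2]);
    let result = ArrayData::try_new(
        DataType::Utf8,
        1,
        None,
        None,
        0,
        vec![offsets, values],
        vec![],
    );
    assert!(result.is_err());
}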
@@ -579,7 +581,8 @@ impl ArrayData {
     /// contents of the buffers (e.g. that all offsets for UTF8 arrays
     /// are within the bounds of the values buffer).
     ///
-    /// TODO: add a validate_full that validates the offsets and valid utf8 data
+    /// See [`validate_full`] to fully validate the offset contents
+    /// and the validity of utf8 data
     pub fn validate(&self) -> Result<()> {
         // Need at least this much space in each buffer
         let len_plus_offset = self.len + self.offset;
@@ -686,19 +689,20 @@ impl ArrayData {
         Ok(())
     }
 
-    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
-    /// offsets (of type T) into some other buffer of `values_length` bytes long
-    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
-        &self,
-        buffer: &Buffer,
-        values_length: usize,
-    ) -> Result<()> {
+    /// Returns a reference to the data in `buffer` as a typed slice
+    /// (typically `&[i32]` or `&[i64]`) after validating. The
+    /// returned slice is guaranteed to have at least `self.len + 1`
+    /// entries
+    fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &'a self,
+        buffer: &'a Buffer,
+    ) -> Result<&'a [T]> {
         // Validate that there are the correct number of offsets for this array's length
         let required_offsets = self.len + self.offset + 1;
 
         // An empty list-like array can have 0 offsets
         if buffer.is_empty() {
-            return Ok(());
+            return Ok(&[]);
         }
 
         if (buffer.len() / std::mem::size_of::<T>()) < required_offsets {
@@ -709,7 +713,23 @@ impl ArrayData {
         }
 
         // Justification: buffer size was validated above
-        let offsets = unsafe { &(buffer.typed_data::<T>()[self.offset..]) };
+        Ok(unsafe {
+            &(buffer.typed_data::<T>()[self.offset..self.offset + self.len + 1])
+        })
+    }
+
+    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
+    /// offsets (of type T) into some other buffer of `values_length` bytes long
+    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
+        &self,
+        buffer: &Buffer,
+        values_length: usize,
+    ) -> Result<()> {
+        // `typed_offsets` validates the buffer size internally
+        let offsets = self.typed_offsets::<T>(buffer)?;
+        if offsets.is_empty() {
+            return Ok(());
+        }
 
         let first_offset = offsets[0].to_usize().ok_or_else(|| {
             ArrowError::InvalidArgumentError(format!(
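For a concrete picture of the invariant these helpers enforce, a sketch with invented values (not part of the patch): offsets must begin at a non-negative value, never decrease, and stay within the values buffer.

// [0, 2, 3, 3] slices the 3-byte buffer "abc" into "ab" (0..2),
// "c" (2..3), and the empty string "" (3..3); `self.len` is 3 and the
// offsets buffer holds `self.len + 1` entries.
let values = Buffer::from_slice_ref(&[b'a', b'b', b'c']);
let offsets = Buffer::from_slice_ref(&[0i32, 2, 3, 3]);
let _data = ArrayData::try_new(
    DataType::Utf8,
    3,
    None,
    None,
    0,
    vec![offsets, values],
    vec![],
)
.expect("offsets are monotonic and in bounds");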
@@ -875,6 +895,251 @@ impl ArrayData {
         values_data.validate()?;
         Ok(values_data)
     }
+
+    /// "expensive" validation that ensures:
+    ///
+    /// 1. Null count is correct
+    /// 2. All offsets are valid
+    /// 3. All String data is valid UTF-8
+    /// 4. All dictionary offsets are valid
+    ///
+    /// Does not (yet) check
+    /// 1. Union type_ids are valid (see https://github.com/apache/arrow-rs/issues/85)
+    ///
+    /// Note: this calls `validate()` internally
+    pub fn validate_full(&self) -> Result<()> {
+        // Check all buffer sizes prior to looking at them more deeply in this function
+        self.validate()?;
+
+        let null_bitmap_buffer = self
+            .null_bitmap
+            .as_ref()
+            .map(|null_bitmap| null_bitmap.buffer_ref());
+
+        let actual_null_count = count_nulls(null_bitmap_buffer, self.offset, self.len);
+        if actual_null_count != self.null_count {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "null_count value ({}) doesn't match actual number of nulls in array ({})",
+                self.null_count, actual_null_count
+            )));
+        }
+
+        match &self.data_type {
+            DataType::Utf8 => {
+                self.validate_utf8::<i32>()?;
+            }
+            DataType::LargeUtf8 => {
+                self.validate_utf8::<i64>()?;
+            }
+            DataType::Binary => {
+                self.validate_offsets_full::<i32>(self.buffers[1].len())?;
+            }
+            DataType::LargeBinary => {
+                self.validate_offsets_full::<i64>(self.buffers[1].len())?;
+            }
+            DataType::List(_) | DataType::Map(_, _) => {
+                let child = &self.child_data[0];
+                self.validate_offsets_full::<i32>(child.len + child.offset)?;
+            }
+            DataType::LargeList(_) => {
+                let child = &self.child_data[0];
+                self.validate_offsets_full::<i64>(child.len + child.offset)?;
+            }
+            DataType::Union(_) => {
+                // Union arrays will be validated as part of implementing new Union semantics
+                // See comments in `ArrayData::validate()`
+                // https://github.com/apache/arrow-rs/issues/85
+            }
+            DataType::Dictionary(key_type, _value_type) => {
+                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
+                let max_value = dictionary_length - 1;
+                match key_type.as_ref() {
+                    DataType::UInt8 => self.check_bounds::<u8>(max_value)?,
+                    DataType::UInt16 => self.check_bounds::<u16>(max_value)?,
+                    DataType::UInt32 => self.check_bounds::<u32>(max_value)?,
+                    DataType::UInt64 => self.check_bounds::<u64>(max_value)?,
+                    DataType::Int8 => self.check_bounds::<i8>(max_value)?,
+                    DataType::Int16 => self.check_bounds::<i16>(max_value)?,
+                    DataType::Int32 => self.check_bounds::<i32>(max_value)?,
+                    DataType::Int64 => self.check_bounds::<i64>(max_value)?,
+                    _ => unreachable!(),
+                }
+            }
+            _ => {
+                // No extra validation check required for other types
+            }
+        };
+
+        // validate all children recursively
+        self.child_data
+            .iter()
+            .enumerate()
+            .try_for_each(|(i, child_data)| {
+                child_data.validate_full().map_err(|e| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "{} child #{} invalid: {}",
+                        self.data_type, i, e
+                    ))
+                })
+            })?;
+
+        Ok(())
+    }
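A sketch of the cheap/expensive split (invented values; `new_unchecked` is used deliberately here to bypass the construction-time check):

// The buffer sizes and first/last offsets are structurally consistent, so
// the cheap `validate()` passes; but the single "string" is a lone UTF-8
// continuation byte (0x80), which only the full content check catches.
let values = Buffer::from_slice_ref(&[0x80u8]);
let offsets = Buffer::from_slice_ref(&[0i32, 1]);
let data = unsafe {
    ArrayData::new_unchecked(DataType::Utf8, 1, None, None, 0, vec![offsets, values], vec![])
};
assert!(data.validate().is_ok());
assert!(data.validate_full().is_err());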
+    /// Calls the `validate(item_index, range)` function for each of
+    /// the ranges specified in the arrow offset buffer of type
+    /// `T`. Also validates that each offset is smaller than
+    /// `offset_limit`
+    ///
+    /// For example, if the offset buffer contained `[1, 2, 4]`, this
+    /// function would call `validate(0, 1..2)` and `validate(1, 2..4)`
+    fn validate_each_offset<T, V>(
+        &self,
+        offset_buffer: &Buffer,
+        offset_limit: usize,
+        validate: V,
+    ) -> Result<()>
+    where
+        T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
+        V: Fn(usize, Range<usize>) -> Result<()>,
+    {
+        // An empty binary-like array can have 0 offsets
+        if self.len == 0 && offset_buffer.is_empty() {
+            return Ok(());
+        }
+
+        let offsets = self.typed_offsets::<T>(offset_buffer)?;
+
+        offsets
+            .iter()
+            .zip(offsets.iter().skip(1))
+            .enumerate()
+            .map(|(i, (&start_offset, &end_offset))| {
+                let start_offset: usize = start_offset
+                    .try_into()
+                    .map_err(|_| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Offset invariant failure: could not convert start_offset {} to usize in slot {}",
+                            start_offset, i))
+                    })?;
+                let end_offset: usize = end_offset
+                    .try_into()
+                    .map_err(|_| {
+                        ArrowError::InvalidArgumentError(format!(
+                            "Offset invariant failure: Could not convert end_offset {} to usize in slot {}",
+                            end_offset, i+1))
+                    })?;
+
+                if start_offset > offset_limit {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
+                        i, start_offset, offset_limit))
+                    );
+                }
+
+                if end_offset > offset_limit {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
+                        i, end_offset, offset_limit))
+                    );
+                }
+
+                // check range actually is low -> high
+                if start_offset > end_offset {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
+                        i, start_offset, end_offset))
+                    );
+                }
+
+                Ok((i, start_offset..end_offset))
+            })
+            .try_for_each(|res: Result<(usize, Range<usize>)>| {
+                let (item_index, range) = res?;
+                validate(item_index, range)
+            })
+    }
+
+    /// Ensures that all strings formed by the offsets in buffers[0]
+    /// into buffers[1] are valid utf8 sequences
+    fn validate_utf8<T>(&self) -> Result<()>
+    where
+        T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
+    {
+        let offset_buffer = &self.buffers[0];
+        let values_buffer = &self.buffers[1].as_slice();
+
+        self.validate_each_offset::<T, _>(
+            offset_buffer,
+            values_buffer.len(),
+            |string_index, range| {
+                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Invalid UTF8 sequence at string index {} ({:?}): {}",
+                        string_index, range, e
+                    ))
+                })?;
+                Ok(())
+            },
+        )
+    }
+
+    /// Ensures that all offsets in buffers[0] into buffers[1] are
+    /// between `0` and `offset_limit`
+    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<()>
+    where
+        T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
+    {
+        let offset_buffer = &self.buffers[0];
+
+        self.validate_each_offset::<T, _>(
+            offset_buffer,
+            offset_limit,
+            |_string_index, _range| {
+                // No validation applied to each value, but the iteration
+                // itself applies bounds checking to each range
+                Ok(())
+            },
+        )
+    }
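The pairwise walk at the heart of `validate_each_offset` can be pictured in isolation; a self-contained sketch with plain `usize` offsets (invented for illustration, mirroring the doc example above):

use std::ops::Range;

let offsets = [1usize, 2, 4];
// Zipping the offsets with themselves shifted by one yields one
// (slot_index, start..end) window per array slot.
let pairs: Vec<(usize, Range<usize>)> = offsets
    .iter()
    .zip(offsets.iter().skip(1))
    .enumerate()
    .map(|(i, (&start, &end))| (i, start..end))
    .collect();
assert_eq!(pairs, vec![(0, 1..2), (1, 2..4)]);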
+    /// Validates that each value in `self.buffers[0]` (typed as T)
+    /// is within the range [0, max_value], inclusive
+    fn check_bounds<T>(&self, max_value: i64) -> Result<()>
+    where
+        T: ArrowNativeType + std::convert::TryInto<i64> + num::Num + std::fmt::Display,
+    {
+        let required_len = self.len + self.offset;
+        let buffer = &self.buffers[0];
+
+        // This should have been checked as part of `validate()` prior
+        // to calling `validate_full()` but double check to be sure
+        assert!(buffer.len() / std::mem::size_of::<T>() >= required_len);
+
+        // Justification: buffer size was validated above
+        let indexes: &[T] =
+            unsafe { &(buffer.typed_data::<T>()[self.offset..self.offset + self.len]) };
+
+        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
+            // Do not check a null value (it can be arbitrary)
+            if self.is_null(i) {
+                return Ok(());
+            }
+            let dict_index: i64 = dict_index.try_into().map_err(|_| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Value at position {} out of bounds: {} (can not convert to i64)",
+                    i, dict_index
+                ))
+            })?;
+
+            if dict_index < 0 || dict_index > max_value {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Value at position {} out of bounds: {} (should be in [0, {}])",
+                    i, dict_index, max_value
+                )));
+            }
+            Ok(())
+        })
+    }
 }
 
 /// Return the expected [`DataTypeLayout`] Arrays of this data
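One consequence of the `self.is_null(i)` early return is worth spelling out: a key slot masked out by the validity bitmap may hold an arbitrary value. A sketch with invented values (using public accessors such as `data_type()` rather than the private fields the tests below touch):

// The key buffer holds [1, 99]; 99 is far out of bounds, but slot 1 is
// marked null by the bitmap, so check_bounds never inspects it.
let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
let data_type = DataType::Dictionary(
    Box::new(DataType::Int32),
    Box::new(values.data_type().clone()),
);
let keys = Buffer::from_slice_ref(&[1i32, 99]);
let null_bits = Buffer::from_slice_ref(&[0b0000_0001u8]); // only slot 0 is valid
ArrayData::try_new(
    data_type,
    2,
    None,
    Some(null_bits),
    0,
    vec![keys],
    vec![values.data().clone()],
)
.expect("null key slots are not bounds-checked");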
@@ -1115,6 +1380,7 @@ mod tests {
 
     use crate::array::{
         Array, BooleanBuilder, Int32Array, Int32Builder, StringArray, StructBuilder,
+        UInt64Array,
     };
     use crate::buffer::Buffer;
     use crate::datatypes::Field;
@@ -1632,6 +1898,360 @@ mod tests {
         .unwrap();
     }
 
+    /// Test that an array of type `data_type` with invalid utf8 data errors
+    fn check_utf8_validation<T: ArrowNativeType>(data_type: DataType) {
+        // 0x80 is a utf8 continuation byte and is not valid utf8 on its own
+        let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]);
+        let offsets: Vec<T> = [0, 2, 3]
+            .iter()
+            .map(|&v| T::from_usize(v).unwrap())
+            .collect();
+
+        let offsets_buffer = Buffer::from_slice_ref(&offsets);
+        ArrayData::try_new(
+            data_type,
+            2,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")]
+    fn test_validate_utf8_content() {
+        check_utf8_validation::<i32>(DataType::Utf8);
+    }
+
+    #[test]
+    #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")]
+    fn test_validate_large_utf8_content() {
+        check_utf8_validation::<i64>(DataType::LargeUtf8);
+    }
+
+    /// Test that an array of type `data_type` with out of bounds offsets errors
+    fn check_index_out_of_bounds_validation<T: ArrowNativeType>(data_type: DataType) {
+        let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']);
+        // First three offsets are fine, then 5 is out of bounds
+        let offsets: Vec<T> = [0, 1, 2, 5, 2]
+            .iter()
+            .map(|&v| T::from_usize(v).unwrap())
+            .collect();
+
+        let offsets_buffer = Buffer::from_slice_ref(&offsets);
+        ArrayData::try_new(
+            data_type,
+            4,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+    )]
+    fn test_validate_utf8_out_of_bounds() {
+        check_index_out_of_bounds_validation::<i32>(DataType::Utf8);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+    )]
+    fn test_validate_large_utf8_out_of_bounds() {
+        check_index_out_of_bounds_validation::<i64>(DataType::LargeUtf8);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+    )]
+    fn test_validate_binary_out_of_bounds() {
+        check_index_out_of_bounds_validation::<i32>(DataType::Binary);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4"
+    )]
+    fn test_validate_large_binary_out_of_bounds() {
+        check_index_out_of_bounds_validation::<i64>(DataType::LargeBinary);
+    }
+
+    /// Test that an array of type `data_type` with offsets that go backwards errors
+    fn check_index_backwards_validation<T: ArrowNativeType>(data_type: DataType) {
+        let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']);
+        // First four offsets are fine, then 1 goes backwards
+        let offsets: Vec<T> = [0, 1, 2, 2, 1]
+            .iter()
+            .map(|&v| T::from_usize(v).unwrap())
+            .collect();
+
+        let offsets_buffer = Buffer::from_slice_ref(&offsets);
+        ArrayData::try_new(
+            data_type,
+            4,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
+    )]
+    fn test_validate_utf8_index_backwards() {
+        check_index_backwards_validation::<i32>(DataType::Utf8);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
+    )]
+    fn test_validate_large_utf8_index_backwards() {
+        check_index_backwards_validation::<i64>(DataType::LargeUtf8);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
+    )]
+    fn test_validate_binary_index_backwards() {
+        check_index_backwards_validation::<i32>(DataType::Binary);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
+    )]
+    fn test_validate_large_binary_index_backwards() {
+        check_index_backwards_validation::<i64>(DataType::LargeBinary);
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])"
+    )]
+    fn test_validate_dictionary_index_too_large() {
+        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+
+        // 3 is not a valid index into the values (only 0 and 1)
+        let keys: Int32Array = [Some(1), Some(3)].into_iter().collect();
+
+        let data_type = DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        );
+
+        ArrayData::try_new(
+            data_type,
+            2,
+            None,
+            None,
+            0,
+            vec![keys.data().buffers[0].clone()],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]"
+    )]
+    fn test_validate_dictionary_index_negative() {
+        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+
+        // -1 is not a valid index at all!
+        let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect();
+
+        let data_type = DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        );
+
+        ArrayData::try_new(
+            data_type,
+            2,
+            None,
+            None,
+            0,
+            vec![keys.data().buffers[0].clone()],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_validate_dictionary_index_negative_but_not_referenced() {
+        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+
+        // -1 is not a valid index at all, but the array is length 1
+        // so the -1 should not be looked at
+        let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect();
+
+        let data_type = DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        );
+
+        // Expect this not to panic
+        ArrayData::try_new(
+            data_type,
+            1,
+            None,
+            None,
+            0,
+            vec![keys.data().buffers[0].clone()],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)"
+    )]
+    fn test_validate_dictionary_index_giant_negative() {
+        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+
+        // u64::MAX cannot be converted to i64, so it is out of bounds as a key
+        let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect();
+
+        let data_type = DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        );
+
+        ArrayData::try_new(
+            data_type,
+            2,
+            None,
+            None,
+            0,
+            vec![keys.data().buffers[0].clone()],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    /// Test that a list of type `data_type` generates correct offset out of bounds errors
+    fn check_list_offsets<T: ArrowNativeType>(data_type: DataType) {
+        let values: Int32Array =
+            [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
+
+        // 5 is an invalid offset into a child array of only four values
+        let offsets: Vec<T> = [0, 2, 5, 4]
+            .iter()
+            .map(|&v| T::from_usize(v).unwrap())
+            .collect();
+        let offsets_buffer = Buffer::from_slice_ref(&offsets);
+
+        ArrayData::try_new(
+            data_type,
+            3,
+            None,
+            None,
+            0,
+            vec![offsets_buffer],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+    )]
+    fn test_validate_list_offsets() {
+        let field_type = Field::new("f", DataType::Int32, true);
+        check_list_offsets::<i32>(DataType::List(Box::new(field_type)));
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4"
+    )]
+    fn test_validate_large_list_offsets() {
+        let field_type = Field::new("f", DataType::Int32, true);
+        check_list_offsets::<i64>(DataType::LargeList(Box::new(field_type)));
+    }
+
+    /// Test that a list of type `data_type` generates correct errors for negative offsets
+    #[test]
+    #[should_panic(
+        expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2"
+    )]
+    fn test_validate_list_negative_offsets() {
+        let values: Int32Array =
+            [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
+        let field_type = Field::new("f", values.data_type().clone(), true);
+        let data_type = DataType::List(Box::new(field_type));
+
+        // -1 is an invalid offset any way you look at it
+        let offsets: Vec<i32> = vec![0, 2, -1, 4];
+        let offsets_buffer = Buffer::from_slice_ref(&offsets);
+
+        ArrayData::try_new(
+            data_type,
+            3,
+            None,
+            None,
+            0,
+            vec![offsets_buffer],
+            vec![values.data().clone()],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "child #0 invalid: Invalid argument error: Value at position 1 out of bounds: -1 (should be in [0, 1])"
+    )]
+    /// Test that children are validated recursively (i.e. that bugs in child data of a struct are also flagged)
+    fn test_validate_recursive() {
+        // Form an invalid dictionary array
+        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+        // -1 is not a valid index
+        let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect();
+
+        let dict_data_type = DataType::Dictionary(
+            Box::new(keys.data_type().clone()),
+            Box::new(values.data_type().clone()),
+        );
+
+        // purposely create invalid child data
+        let dict_data = unsafe {
+            ArrayData::new_unchecked(
+                dict_data_type,
+                2,
+                None,
+                None,
+                0,
+                vec![keys.data().buffers[0].clone()],
+                vec![values.data().clone()],
+            )
+        };
+
+        // Now, try to create a struct with this invalid child data (and expect an error)
+        let data_type =
+            DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]);
+
+        ArrayData::try_new(data_type, 1, None, None, 0, vec![], vec![dict_data]).unwrap();
+    }
+
     /// returns a buffer initialized with some constant value for tests
     fn make_i32_buffer(n: usize) -> Buffer {
         Buffer::from_slice_ref(&vec![42i32; n])