diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 0ea74f789dd..69831d55011 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -929,6 +929,43 @@ impl ArrayData { Ok(()) } + /// Does a cheap sanity check that the `self.len` values in `buffer` are valid + /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long + fn validate_offsets_and_sizes( + &self, + values_length: usize, + ) -> Result<(), ArrowError> { + let offsets: &[T] = self.typed_buffer(0, self.len)?; + let sizes: &[T] = self.typed_buffer(1, self.len)?; + for i in 0..values_length { + let size = sizes[i].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting size[{}] ({}) to usize for {}", + i, sizes[i], self.data_type + )) + })?; + let offset = offsets[i].to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Error converting offset[{}] ({}) to usize for {}", + i, offsets[i], self.data_type + )) + })?; + if offset > values_length { + return Err(ArrowError::InvalidArgumentError(format!( + "Size {} at index {} is offset {} is out of bounds for {}", + size, i, offset, self.data_type + ))); + } + if size > values_length - offset { + return Err(ArrowError::InvalidArgumentError(format!( + "Size {} at index {} is larger than the remaining values for {}", + size, i, self.data_type + ))); + } + } + Ok(()) + } + /// Validates the layout of `child_data` ArrayData structures fn validate_child_data(&self) -> Result<(), ArrowError> { match &self.data_type { @@ -942,6 +979,16 @@ impl ArrayData { self.validate_offsets::(values_data.len)?; Ok(()) } + DataType::ListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets_and_sizes::(values_data.len)?; + Ok(()) + } + DataType::LargeListView(field) => { + let values_data = self.get_single_valid_child_data(field.data_type())?; + self.validate_offsets_and_sizes::(values_data.len)?; + Ok(()) + } DataType::FixedSizeList(field, list_size) => { let values_data = self.get_single_valid_child_data(field.data_type())?; @@ -1546,9 +1593,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data DataType::List(_) => DataTypeLayout::new_fixed_width::(), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } + DataType::ListView(_) => DataTypeLayout::new_list_view::(), + DataType::LargeListView(_) => DataTypeLayout::new_list_view::(), DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data, @@ -1661,6 +1707,24 @@ impl DataTypeLayout { variadic: true, } } + + /// Describes a list view type + pub fn new_list_view() -> Self { + Self { + buffers: vec![ + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }, + ], + can_contain_null_mask: true, + variadic: true, + } + } } /// Layout specification for a single data type buffer diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index f5298f82e0a..41def9051d4 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -342,6 +342,94 @@ fn test_validate_offsets_last_too_large() { .unwrap(); } +/// Test that the list of type `data_type` generates correct offset and size out of bounds errors +fn check_list_view_offsets_sizes( + data_type: DataType, + offsets: Vec, + sizes: Vec, +) { + let values: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); + let offsets_buffer = Buffer::from_slice_ref(offsets); + let sizes_buffer = Buffer::from_slice_ref(sizes); + ArrayData::try_new( + data_type, + 4, + None, + 0, + vec![offsets_buffer, sizes_buffer], + vec![values.into_data()], + ) + .unwrap(); +} + +#[test] +#[should_panic(expected = "Size 3 at index 3 is larger than the remaining values for ListView")] +fn test_validate_list_view_offsets_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic( + expected = "Size 3 at index 3 is larger than the remaining values for LargeListView" +)] +fn test_validate_large_list_view_offsets_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting offset[1] (-1) to usize for ListView")] +fn test_validate_list_view_negative_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, -1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting size[2] (-1) to usize for ListView")] +fn test_validate_list_view_negative_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::ListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, -1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting offset[1] (-1) to usize for LargeListView")] +fn test_validate_large_list_view_negative_offsets() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, -1, 1, 2], + vec![1, 1, 1, 3], + ); +} + +#[test] +#[should_panic(expected = "Error converting size[2] (-1) to usize for LargeListView")] +fn test_validate_large_list_view_negative_sizes() { + let field_type = Field::new("f", DataType::Int32, true); + check_list_view_offsets_sizes::( + DataType::LargeListView(Arc::new(field_type)), + vec![0, 1, 1, 2], + vec![1, 1, -1, 3], + ); +} + #[test] #[should_panic( expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"