From 425c01679e77e8396260fe36e5cd4be0d400860f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 17 Oct 2022 18:19:50 +1300 Subject: [PATCH 1/2] Improve row format docs --- arrow/src/row/mod.rs | 85 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index f604f65706d..2c7119f4fcf 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -16,6 +16,67 @@ // under the License. //! A comparable row-oriented representation of a collection of [`Array`] +//! +//! As [`Row`] are [normalized for sorting], they can be very efficiently [compared](PartialOrd), +//! using [`memcmp`] under the hood, or used in [non-comparison sorts] such as [radix sort]. This +//! makes the row format ideal for implementing efficient multi-column sorting, +//! grouping, aggregation, windowing and more. +//! +//! _Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to +//! yield a meaningful ordering_ +//! ``` +//! # use std::sync::Arc; +//! # use arrow::row::{RowConverter, SortField}; +//! # use arrow_array::{ArrayRef, Int32Array, StringArray}; +//! # use arrow_array::cast::{as_primitive_array, as_string_array}; +//! # use arrow_array::types::Int32Type; +//! # use arrow_schema::DataType; +//! +//! let a1 = Arc::new(Int32Array::from_iter_values([-1, -1, 0, 3, 3])) as ArrayRef; +//! let a2 = Arc::new(StringArray::from_iter_values(["a", "b", "c", "d", "d"])) as ArrayRef; +//! let arrays = vec![a1, a2]; +//! +//! // Convert arrays to rows +//! let mut converter = RowConverter::new(vec![ +//! SortField::new(DataType::Int32), +//! SortField::new(DataType::Utf8), +//! ]); +//! let rows = converter.convert_columns(&arrays).unwrap(); +//! +//! // Compare rows +//! for i in 0..4 { +//! assert!(rows.row(i) <= rows.row(i + 1)); +//! } +//! assert_eq!(rows.row(3), rows.row(4)); +//! +//! // Convert rows back to arrays +//! 
let converted = converter.convert_rows(&rows).unwrap(); +//! assert_eq!(arrays, converted); +//! +//! // Compare rows from different arrays +//! let a1 = Arc::new(Int32Array::from_iter_values([3, 4])) as ArrayRef; +//! let a2 = Arc::new(StringArray::from_iter_values(["e", "f"])) as ArrayRef; +//! let arrays = vec![a1, a2]; +//! let rows2 = converter.convert_columns(&arrays).unwrap(); +//! +//! assert!(rows.row(4) < rows2.row(0)); +//! assert!(rows.row(4) < rows2.row(1)); +//! +//! // Convert selection of rows back to arrays +//! let selection = [rows.row(0), rows2.row(1), rows.row(2), rows2.row(0)]; +//! let converted = converter.convert_rows(selection).unwrap(); +//! let c1 = as_primitive_array::<Int32Type>(converted[0].as_ref()); +//! assert_eq!(c1.values(), &[-1, 4, 0, 3]); +//! +//! let c2 = as_string_array(converted[1].as_ref()); +//! let c2_values: Vec<_> = c2.iter().flatten().collect(); +//! assert_eq!(&c2_values, &["a", "f", "c", "e"]); +//! ``` +//! +//! [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] +//! [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] +//! [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] +//! [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] use std::cmp::Ordering; use std::hash::{Hash, Hasher}; @@ -43,14 +104,7 @@ mod fixed; mod interner; mod variable; -/// Converts [`ArrayRef`] columns into a row-oriented format that are [normalized for sorting]. -/// -/// In particular, a byte-wise comparison of the rows, e.g. [`memcmp`], is sufficient -/// to establish the ordering of two rows, allowing for extremely fast comparisons, -/// and permitting the use of [non-comparison sorts] such as [radix sort] -/// -/// Comparing [`Rows`] generated by different [`RowConverter`] is not guaranteed to -/// yield a meaningful ordering +/// Converts [`ArrayRef`] columns into a row-oriented format.
/// /// # Format /// @@ -130,17 +184,6 @@ mod variable; /// /// The order of a given column can be reversed by negating the encoded bytes of non-null values /// -/// ## Reconstruction -/// -/// Given a schema it would theoretically be possible to reconstruct the columnar data from -/// the row format, however, this is currently not implemented. It is recommended that the row -/// format is instead used to obtain a sorted list of row indices, which can then be used -/// with [`take`](crate::compute::take) to obtain a sorted [`Array`] -/// -/// [non-comparison sorts]:[https://en.wikipedia.org/wiki/Sorting_algorithm#Non-comparison_sorts] -/// [radix sort]:[https://en.wikipedia.org/wiki/Radix_sort] -/// [normalized for sorting]:[https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.83.1080&rep=rep1&type=pdf] -/// [`memcmp`]:[https://www.man7.org/linux/man-pages/man3/memcmp.3.html] /// [COBS]:[https://en.wikipedia.org/wiki/Consistent_Overhead_Byte_Stuffing] /// [byte stuffing]:[https://en.wikipedia.org/wiki/High-Level_Data_Link_Control#Asynchronous_framing] #[derive(Debug)] @@ -307,6 +350,10 @@ impl Rows { pub fn num_rows(&self) -> usize { self.offsets.len() - 1 } + + pub fn iter(&self) -> RowsIter<'_> { + self.into_iter() + } } impl<'a> IntoIterator for &'a Rows { From 28f46f21c2b266edad853ef4bab3b9ff4f653696 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 17 Oct 2022 20:27:43 +1300 Subject: [PATCH 2/2] Format --- arrow/src/row/mod.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arrow/src/row/mod.rs b/arrow/src/row/mod.rs index 2c7119f4fcf..8d673205441 100644 --- a/arrow/src/row/mod.rs +++ b/arrow/src/row/mod.rs @@ -31,43 +31,43 @@ //! # use arrow_array::cast::{as_primitive_array, as_string_array}; //! # use arrow_array::types::Int32Type; //! # use arrow_schema::DataType; -//! +//! //! let a1 = Arc::new(Int32Array::from_iter_values([-1, -1, 0, 3, 3])) as ArrayRef; //! 
let a2 = Arc::new(StringArray::from_iter_values(["a", "b", "c", "d", "d"])) as ArrayRef; //! let arrays = vec![a1, a2]; -//! +//! //! // Convert arrays to rows //! let mut converter = RowConverter::new(vec![ //! SortField::new(DataType::Int32), //! SortField::new(DataType::Utf8), //! ]); //! let rows = converter.convert_columns(&arrays).unwrap(); -//! +//! //! // Compare rows //! for i in 0..4 { //! assert!(rows.row(i) <= rows.row(i + 1)); //! } //! assert_eq!(rows.row(3), rows.row(4)); -//! +//! //! // Convert rows back to arrays //! let converted = converter.convert_rows(&rows).unwrap(); //! assert_eq!(arrays, converted); -//! +//! //! // Compare rows from different arrays //! let a1 = Arc::new(Int32Array::from_iter_values([3, 4])) as ArrayRef; //! let a2 = Arc::new(StringArray::from_iter_values(["e", "f"])) as ArrayRef; //! let arrays = vec![a1, a2]; //! let rows2 = converter.convert_columns(&arrays).unwrap(); -//! +//! //! assert!(rows.row(4) < rows2.row(0)); //! assert!(rows.row(4) < rows2.row(1)); -//! +//! //! // Convert selection of rows back to arrays //! let selection = [rows.row(0), rows2.row(1), rows.row(2), rows2.row(0)]; //! let converted = converter.convert_rows(selection).unwrap(); //! let c1 = as_primitive_array::<Int32Type>(converted[0].as_ref()); //! assert_eq!(c1.values(), &[-1, 4, 0, 3]); -//! +//! //! let c2 = as_string_array(converted[1].as_ref()); //! let c2_values: Vec<_> = c2.iter().flatten().collect(); //! assert_eq!(&c2_values, &["a", "f", "c", "e"]);