Skip to content

Commit

Permalink
Minor: Update doc strings about Page Index / Column Index (#3625)
Browse files Browse the repository at this point in the history
* Minor: Update doc strings about what Page Index / Column Index

* tweaks

* typos

* Apply suggestions from code review

* Update parquet/src/file/metadata.rs

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

---------

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
  • Loading branch information
alamb and viirya committed Jan 31, 2023
1 parent 9c95533 commit f78a9be
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 15 deletions.
29 changes: 27 additions & 2 deletions parquet/src/file/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,25 @@ use crate::schema::types::{
Type as SchemaType,
};

/// [`Index`] for each column of each row group.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`PageLocation`] for each data page of each column of each row group.
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
/// `column_number` of row group `row_group_number`.
///
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
/// the fifth page of the fourth column in the third row group of the
/// parquet file.
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;

/// Global Parquet metadata.
Expand All @@ -65,8 +83,8 @@ pub struct ParquetMetaData {
}

impl ParquetMetaData {
/// Creates Parquet metadata from file metadata and a list of row group metadata `Arc`s
/// for each available row group.
/// Creates Parquet metadata from file metadata and a list of row
/// group metadata
pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
ParquetMetaData {
file_metadata,
Expand All @@ -76,6 +94,8 @@ impl ParquetMetaData {
}
}

/// Creates Parquet metadata from file metadata, a list of row
/// group metadata, and the column index structures.
pub fn new_with_page_index(
file_metadata: FileMetaData,
row_groups: Vec<RowGroupMetaData>,
Expand Down Expand Up @@ -232,6 +252,7 @@ pub struct RowGroupMetaData {
sorting_columns: Option<Vec<SortingColumn>>,
total_byte_size: i64,
schema_descr: SchemaDescPtr,
/// `page_offset_index[column_number][page_number]`
page_offset_index: Option<Vec<Vec<PageLocation>>>,
}

Expand Down Expand Up @@ -277,6 +298,8 @@ impl RowGroupMetaData {
}

/// Returns a reference to the page offset index of all columns in this
/// row group, or `None` if the index is not present.
///
/// The returned vector contains `page_offset[column_number][page_number]`
pub fn page_offset_index(&self) -> Option<&Vec<Vec<PageLocation>>> {
self.page_offset_index.as_ref()
}
Expand All @@ -292,6 +315,8 @@ impl RowGroupMetaData {
}

/// Sets the page offset index for this row group, replacing any
/// previously stored value.
///
/// The vector represents `page_offset[column_number][page_number]`
pub fn set_page_offset(&mut self, page_offset: Vec<Vec<PageLocation>>) {
self.page_offset_index = Some(page_offset);
}
Expand Down
2 changes: 2 additions & 0 deletions parquet/src/file/page_encoding_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

//! Per-page encoding information.

use crate::basic::{Encoding, PageType};
use crate::errors::Result;
use crate::format::{
Expand Down
24 changes: 20 additions & 4 deletions parquet/src/file/page_index/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

//! [`Index`] structures holding decoded [`ColumnIndex`] information

use crate::basic::Type;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{ByteArray, Int96};
Expand All @@ -23,7 +25,14 @@ use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;

/// The statistics in one page
/// PageIndex Statistics for one data page, as described in [Column Index].
///
/// One significant difference from the row group level
/// [`Statistics`](crate::format::Statistics) is that page level
/// statistics may not store actual column values as min and max
/// (e.g. they may store truncated strings to save space)
///
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PageIndex<T> {
/// The minimum value. It is `None` when all values are null
Expand All @@ -48,6 +57,10 @@ impl<T> PageIndex<T> {

#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
/// Typed statistics for a data page in a column chunk. This structure
/// is obtained from decoding the [ColumnIndex] in the parquet file
/// and can be used to skip decoding pages while reading the file
/// data.
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
Expand Down Expand Up @@ -90,14 +103,17 @@ impl Index {
}
}

/// An index of a column of [`Type`] physical representation
/// Stores the [`PageIndex`] for each page of a column with [`Type`]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NativeIndex<T: ParquetValueType> {
/// The physical type
/// The physical type of this column
pub physical_type: Type,
/// The indexes, one item per page
pub indexes: Vec<PageIndex<T>>,
/// the order
/// If the min/max elements are ordered, and if so in which
/// direction. See [source] for details.
///
/// [source]: https://github.com/apache/parquet-format/blob/bfc549b93e6927cb1fc425466e4084f76edc6d22/src/main/thrift/parquet.thrift#L959-L964
pub boundary_order: BoundaryOrder,
}

Expand Down
28 changes: 24 additions & 4 deletions parquet/src/file/page_index/index_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

//! Support for reading [`Index`] and [`PageLocation`] from parquet metadata.

use crate::basic::Type;
use crate::data_type::Int96;
use crate::errors::ParquetError;
Expand All @@ -25,8 +27,17 @@ use crate::format::{ColumnIndex, OffsetIndex, PageLocation};
use std::io::{Cursor, Read};
use thrift::protocol::{TCompactInputProtocol, TSerializable};

/// Read on row group's all columns indexes and change into [`Index`]
/// If not the format not available return an empty vector.
/// Reads per-column [`Index`] for all columns of a row group by
/// decoding the [`ColumnIndex`].
///
/// Returns a vector of `index[column_number]`.
///
/// Returns an empty vector if this row group does not contain a
/// [`ColumnIndex`].
///
/// See [Column Index Documentation] for more details.
///
/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn read_columns_indexes<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
Expand Down Expand Up @@ -60,8 +71,17 @@ pub fn read_columns_indexes<R: ChunkReader>(
.collect()
}

/// Read on row group's all indexes and change into [`Index`]
/// If not the format not available return an empty vector.
/// Reads per-page [`PageLocation`] for all columns of a row group by
/// decoding the [`OffsetIndex`].
///
/// Returns a vector of `location[column_number][page_number]`.
///
/// Returns an empty vector if this row group does not contain an
/// [`OffsetIndex`].
///
/// See [Column Index Documentation] for more details.
///
/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn read_pages_locations<R: ChunkReader>(
reader: &R,
chunks: &[ColumnChunkMetaData],
Expand Down
4 changes: 4 additions & 0 deletions parquet/src/file/page_index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,9 @@
// specific language governing permissions and limitations
// under the License.

//! Page Index of "[Column Index] Layout to Support Page Skipping"
//!
//! [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md

pub mod index;
pub mod index_reader;
2 changes: 1 addition & 1 deletion parquet/src/file/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

//! Writer properties.
//! [`WriterProperties`]
//!
//! # Usage
//!
Expand Down
5 changes: 3 additions & 2 deletions parquet/src/file/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
// specific language governing permissions and limitations
// under the License.

//! Contains file reader API and provides methods to access file metadata, row group
//! readers to read individual column chunks, or access record iterator.
//! File reader API and methods to access file metadata, row group
//! readers to read individual column chunks, or access record
//! iterator.

use bytes::Bytes;
use std::{boxed::Box, io::Read, sync::Arc};
Expand Down
7 changes: 5 additions & 2 deletions parquet/src/file/serialized_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,16 @@ impl ReadOptionsBuilder {
self
}

/// Enable page index in the reading option,
/// Enable reading the page index structures described in
/// "[Column Index] Layout to Support Page Skipping"
///
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn with_page_index(mut self) -> Self {
self.enable_page_index = true;
self
}

/// Set the `ReaderProperties` configuration.
/// Set the [`ReaderProperties`] configuration.
pub fn with_reader_properties(mut self, properties: ReaderProperties) -> Self {
self.props = Some(properties);
self
Expand Down

0 comments on commit f78a9be

Please sign in to comment.