From b6eaf2243dda9c09806972452c4261a6a9eb4741 Mon Sep 17 00:00:00 2001 From: Yang Jiang Date: Fri, 5 Aug 2022 16:36:51 +0800 Subject: [PATCH] fix: Fix skip error in calculate_row_count. (#2329) * Fix skip error calculate_row_count error * fix ut --- parquet/src/file/serialized_reader.rs | 4 ++- parquet/src/util/page_util.rs | 44 ++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 766813f11ae..034d70e354f 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -1512,7 +1512,9 @@ mod tests { if i != 351 { assert!((meta.num_rows == 21) || (meta.num_rows == 20)); } else { - assert_eq!(meta.num_rows, 11); + // The last page's first row index is 7290 and the total row count is 7300. + // Because row indexes start at zero, the last page holds 7300 - 7290 = 10 rows. + assert_eq!(meta.num_rows, 10); } assert!(!meta.is_dict); vec.push(meta); diff --git a/parquet/src/util/page_util.rs b/parquet/src/util/page_util.rs index 5cdcf7535c6..7716b71167f 100644 --- a/parquet/src/util/page_util.rs +++ b/parquet/src/util/page_util.rs @@ -25,7 +25,8 @@ use crate::file::reader::ChunkReader; /// Use column chunk's offset index to get the `page_num` page row count. pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> { if page_num == indexes.len() - 1 { - Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize) + // first_row_index is zero-based, so there is no need to add one. 
+ Ok((total_row_count - indexes[page_num].first_row_index) as usize) } else { Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize) } @@ -52,3 +53,44 @@ pub(crate) fn get_pages_readable_slices<T: Read + Send, R: ChunkReader<T=T>>(col } Ok((page_readers, has_dictionary_page)) } + +#[cfg(test)] +mod tests { + use super::*; + + /** + parquet-tools meta ./test.parquet got: + + file schema: test_schema + -------------------------------------------------------------------------------- + leaf: REQUIRED INT64 R:0 D: + + row group 1: RC:256 TS:2216 OFFSET:4 + -------------------------------------------------------------------------------- + leaf: INT64 UNCOMPRESSED DO:0 FPO:4 SZ:2216/2216/1.00 VC:256 ENC:PLAIN,RLE ST:[min: 0, max: 255, num_nulls not defined + + parquet-tools column-index -c leaf ./test.parquet got: + + offset index for column leaf: + offset compressed size first row index + page-0 4 554 0 + page-1 558 554 64 + page-2 1112 554 128 + page-3 1666 554 192 + + **/ + #[test] + fn test_calculate_row_count() { + let total_row_count = 256; + let mut indexes = vec![]; + indexes.push(PageLocation::new(4, 554, 0)); + indexes.push(PageLocation::new(558, 554, 64)); + indexes.push(PageLocation::new(1112, 554, 128)); + indexes.push(PageLocation::new(1666, 554, 192)); + for i in 0..4 { + // each page should have 64 rows. + assert_eq!(64, calculate_row_count(indexes.as_slice(), i, total_row_count).unwrap()); + } + + } +}