New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fallback for skip_page/peek_page without OffsetIndex in SerializedPag… #2490
Closed
Closed
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -484,13 +484,17 @@ pub(crate) fn decode_page( | |
Ok(result) | ||
} | ||
|
||
#[allow(clippy::large_enum_variant)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Boxing |
||
enum SerializedPageReaderState { | ||
Values { | ||
/// The current byte offset in the reader | ||
offset: usize, | ||
|
||
/// The length of the chunk in bytes | ||
remaining_bytes: usize, | ||
|
||
/// If the next page has been "peeked", we cache the decoded header here | ||
buffered_header: Option<PageHeader>, | ||
}, | ||
Pages { | ||
/// Remaining page locations | ||
|
@@ -550,6 +554,7 @@ impl<R: ChunkReader> SerializedPageReader<R> { | |
None => SerializedPageReaderState::Values { | ||
offset: start as usize, | ||
remaining_bytes: len as usize, | ||
buffered_header: None, | ||
}, | ||
}; | ||
|
||
|
@@ -577,22 +582,38 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> { | |
SerializedPageReaderState::Values { | ||
offset, | ||
remaining_bytes: remaining, | ||
.. | ||
buffered_header, | ||
} => { | ||
if *remaining == 0 { | ||
return Ok(None); | ||
} | ||
|
||
let mut read = self.reader.get_read(*offset as u64, *remaining)?; | ||
|
||
let (header_len, header) = read_page_header_len(&mut read)?; | ||
let data_len = header.compressed_page_size as usize; | ||
*offset += header_len + data_len; | ||
*remaining -= header_len + data_len; | ||
let header = if let Some(header) = buffered_header.take() { | ||
if header.type_ == PageType::IndexPage { | ||
continue; | ||
} | ||
|
||
if header.type_ == PageType::IndexPage { | ||
continue; | ||
} | ||
let data_len = header.compressed_page_size as usize; | ||
*offset += data_len; | ||
*remaining -= data_len; | ||
|
||
header | ||
} else { | ||
let (header_len, header) = read_page_header_len(&mut read)?; | ||
let data_len = header.compressed_page_size as usize; | ||
*offset += header_len + data_len; | ||
*remaining -= header_len + data_len; | ||
|
||
if header.type_ == PageType::IndexPage { | ||
continue; | ||
} | ||
|
||
header | ||
}; | ||
|
||
let data_len = header.compressed_page_size as usize; | ||
|
||
let mut buffer = Vec::with_capacity(data_len); | ||
let read = read.take(data_len as u64).read_to_end(&mut buffer)?; | ||
|
@@ -648,20 +669,63 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> { | |
} | ||
|
||
fn peek_next_page(&mut self) -> Result<Option<PageMetadata>> { | ||
match &self.state { | ||
SerializedPageReaderState::Values {..} => Err(general_err!("Must set page_offset_index when using peek_next_page in SerializedPageReader.")), | ||
SerializedPageReaderState::Pages { page_locations, dictionary_page, total_rows } => { | ||
match &mut self.state { | ||
SerializedPageReaderState::Values { | ||
offset, | ||
remaining_bytes, | ||
buffered_header, | ||
} => { | ||
loop { | ||
if *remaining_bytes == 0 { | ||
return Ok(None); | ||
} | ||
|
||
return if let Some(header) = buffered_header.as_ref() { | ||
if let Ok(page_metadata) = header.try_into() { | ||
Ok(Some(page_metadata)) | ||
} else { | ||
// For unknown page type (e.g., INDEX_PAGE), skip and read next. | ||
*buffered_header = None; | ||
continue; | ||
} | ||
} else { | ||
let mut read = | ||
self.reader.get_read(*offset as u64, *remaining_bytes)?; | ||
|
||
let (header_len, header) = read_page_header_len(&mut read)?; | ||
|
||
*offset += header_len as usize; | ||
*remaining_bytes -= header_len as usize; | ||
|
||
if let Ok(page_metadata) = (&header).try_into() { | ||
*buffered_header = Some(header); | ||
return Ok(Some(page_metadata)); | ||
} else { | ||
// For unknown page type (e.g., INDEX_PAGE), skip and read next. | ||
continue; | ||
}; | ||
}; | ||
} | ||
} | ||
SerializedPageReaderState::Pages { | ||
page_locations, | ||
dictionary_page, | ||
total_rows, | ||
} => { | ||
if dictionary_page.is_some() { | ||
Ok(Some(PageMetadata{ | ||
Ok(Some(PageMetadata { | ||
num_rows: 0, | ||
is_dict: true | ||
is_dict: true, | ||
})) | ||
} else if let Some(page) = page_locations.front() { | ||
let next_rows = page_locations.get(1).map(|x| x.first_row_index as usize).unwrap_or(*total_rows); | ||
let next_rows = page_locations | ||
.get(1) | ||
.map(|x| x.first_row_index as usize) | ||
.unwrap_or(*total_rows); | ||
|
||
Ok(Some(PageMetadata{ | ||
Ok(Some(PageMetadata { | ||
num_rows: next_rows - page.first_row_index as usize, | ||
is_dict: false | ||
is_dict: false, | ||
})) | ||
} else { | ||
Ok(None) | ||
|
@@ -672,7 +736,33 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> { | |
|
||
fn skip_next_page(&mut self) -> Result<()> { | ||
match &mut self.state { | ||
SerializedPageReaderState::Values {..} =>{ Err(general_err!("Must set page_offset_index when using skip_next_page in SerializedPageReader.")) }, | ||
SerializedPageReaderState::Values { | ||
offset, | ||
remaining_bytes, | ||
buffered_header, | ||
} => { | ||
if let Some(header) = buffered_header { | ||
let remaining_in_page = header.compressed_page_size as usize; | ||
|
||
*offset += remaining_in_page; | ||
*remaining_bytes -= remaining_in_page; | ||
|
||
*buffered_header = None; | ||
Ok(()) | ||
} else { | ||
let mut read = | ||
self.reader.get_read(*offset as u64, *remaining_bytes)?; | ||
|
||
let (header_len, header) = read_page_header_len(&mut read)?; | ||
|
||
let data_len = header.compressed_page_size as usize; | ||
|
||
*offset += header_len + data_len; | ||
*remaining_bytes -= header_len + data_len; | ||
|
||
Ok(()) | ||
} | ||
} | ||
SerializedPageReaderState::Pages { page_locations, .. } => { | ||
page_locations.pop_front(); | ||
|
||
|
@@ -1470,6 +1560,34 @@ mod tests { | |
assert_eq!(vec.len(), 163); | ||
} | ||
|
||
#[test] | ||
fn test_skip_page_without_offset_index() { | ||
let test_file = get_test_file("alltypes_tiny_pages_plain.parquet"); | ||
|
||
let reader_result = SerializedFileReader::new(test_file); | ||
let reader = reader_result.unwrap(); | ||
|
||
let row_group_reader = reader.get_row_group(0).unwrap(); | ||
|
||
//use 'int_col', Boundary order: ASCENDING, total 325 pages. | ||
let mut column_page_reader = row_group_reader.get_column_page_reader(4).unwrap(); | ||
|
||
let mut vec = vec![]; | ||
|
||
for i in 0..325 { | ||
if i % 2 == 0 { | ||
vec.push(column_page_reader.get_next_page().unwrap().unwrap()); | ||
} else { | ||
column_page_reader.skip_next_page().unwrap(); | ||
} | ||
} | ||
//check read all pages. | ||
assert!(column_page_reader.peek_next_page().unwrap().is_none()); | ||
assert!(column_page_reader.get_next_page().unwrap().is_none()); | ||
|
||
assert_eq!(vec.len(), 163); | ||
} | ||
|
||
#[test] | ||
fn test_peek_page_with_dictionary_page() { | ||
let test_file = get_test_file("alltypes_tiny_pages.parquet"); | ||
|
@@ -1513,4 +1631,46 @@ mod tests { | |
|
||
assert_eq!(vec.len(), 352); | ||
} | ||
|
||
#[test] | ||
fn test_peek_page_with_dictionary_page_no_index() { | ||
let test_file = get_test_file("alltypes_tiny_pages.parquet"); | ||
|
||
let reader_result = SerializedFileReader::new(test_file); | ||
let reader = reader_result.unwrap(); | ||
let row_group_reader = reader.get_row_group(0).unwrap(); | ||
|
||
//use 'string_col', Boundary order: UNORDERED, total 352 data ages and 1 dictionary page. | ||
let mut column_page_reader = row_group_reader.get_column_page_reader(9).unwrap(); | ||
|
||
let mut vec = vec![]; | ||
|
||
let meta = column_page_reader.peek_next_page().unwrap().unwrap(); | ||
assert!(meta.is_dict); | ||
let page = column_page_reader.get_next_page().unwrap().unwrap(); | ||
assert!(matches!(page.page_type(), basic::PageType::DICTIONARY_PAGE)); | ||
|
||
for i in 0..352 { | ||
let meta = column_page_reader.peek_next_page().unwrap().unwrap(); | ||
// have checked with `parquet-tools column-index -c string_col ./alltypes_tiny_pages.parquet` | ||
// page meta has two scenarios(21, 20) of num_rows expect last page has 11 rows. | ||
if i != 351 { | ||
assert!((meta.num_rows == 21) || (meta.num_rows == 20)); | ||
} else { | ||
// last page first row index is 7290, total row count is 7300 | ||
// because first row start with zero, last page row count should be 10. | ||
assert_eq!(meta.num_rows, 10); | ||
} | ||
assert!(!meta.is_dict); | ||
vec.push(meta); | ||
let page = column_page_reader.get_next_page().unwrap().unwrap(); | ||
assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE)); | ||
} | ||
|
||
//check read all pages. | ||
assert!(column_page_reader.peek_next_page().unwrap().is_none()); | ||
assert!(column_page_reader.get_next_page().unwrap().is_none()); | ||
|
||
assert_eq!(vec.len(), 352); | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clippy was unhappy because this method is only used with the async feature