Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more tests of RecordReader Batch Size Edge Cases (#2025) #2032

Merged
merged 2 commits
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
35 changes: 26 additions & 9 deletions parquet/src/arrow/arrow_reader.rs
Expand Up @@ -1529,8 +1529,7 @@ mod tests {
assert_eq!(total_rows, expected_rows);
}

#[test]
fn test_row_group_exact_multiple() {
fn test_row_group_batch(row_group_size: usize, batch_size: usize) {
let schema = Arc::new(Schema::new(vec![Field::new(
"list",
ArrowDataType::List(Box::new(Field::new("item", ArrowDataType::Int32, true))),
Expand All @@ -1544,14 +1543,14 @@ mod tests {
schema.clone(),
Some(
WriterProperties::builder()
.set_max_row_group_size(8)
.set_max_row_group_size(row_group_size)
.build(),
),
)
.unwrap();
for _ in 0..2 {
let mut list_builder = ListBuilder::new(Int32Builder::new(10));
for _ in 0..10 {
let mut list_builder = ListBuilder::new(Int32Builder::new(batch_size));
for _ in 0..(batch_size) {
list_builder.append(true).unwrap();
}
let batch = RecordBatch::try_new(
Expand All @@ -1564,9 +1563,27 @@ mod tests {
writer.close().unwrap();

let mut file_reader = ParquetFileArrowReader::try_new(Bytes::from(buf)).unwrap();
let mut record_reader = file_reader.get_record_reader(8).unwrap();
assert_eq!(8, record_reader.next().unwrap().unwrap().num_rows());
assert_eq!(8, record_reader.next().unwrap().unwrap().num_rows());
assert_eq!(4, record_reader.next().unwrap().unwrap().num_rows());
let mut record_reader = file_reader.get_record_reader(batch_size).unwrap();
assert_eq!(
batch_size,
record_reader.next().unwrap().unwrap().num_rows()
);
assert_eq!(
batch_size,
record_reader.next().unwrap().unwrap().num_rows()
);
}

#[test]
fn test_row_group_exact_multiple() {
    use crate::arrow::record_reader::MIN_BATCH_SIZE;

    // Boundary combinations of (row_group_size, batch_size): exact multiples,
    // batch smaller/larger than a row group, and off-by-one around the
    // reader's minimum level-read size.
    let cases = [
        (8, 8),
        (10, 8),
        (8, 10),
        (MIN_BATCH_SIZE, MIN_BATCH_SIZE),
        (MIN_BATCH_SIZE + 1, MIN_BATCH_SIZE),
        (MIN_BATCH_SIZE, MIN_BATCH_SIZE + 1),
        (MIN_BATCH_SIZE, MIN_BATCH_SIZE - 1),
        (MIN_BATCH_SIZE - 1, MIN_BATCH_SIZE),
    ];

    for &(row_group_size, batch_size) in cases.iter() {
        test_row_group_batch(row_group_size, batch_size);
    }
}
}
3 changes: 2 additions & 1 deletion parquet/src/arrow/record_reader/mod.rs
Expand Up @@ -38,7 +38,8 @@ use crate::schema::types::ColumnDescPtr;
pub(crate) mod buffer;
mod definition_levels;

const MIN_BATCH_SIZE: usize = 1024;
/// The minimum number of levels read when reading a repeated field.
///
/// Exposed to the crate so tests can probe batch-size edge cases around it.
pub(crate) const MIN_BATCH_SIZE: usize = 1024;

/// A `RecordReader` is a stateful column reader that delimits semantic records.
pub type RecordReader<T> =
Expand Down