Update parquet to depend on arrow subcrates #3028

Merged · 6 commits · Nov 10, 2022
16 changes: 12 additions & 4 deletions parquet/Cargo.toml
@@ -30,6 +30,15 @@ edition = "2021"
rust-version = "1.62"

[dependencies]
arrow-array = { version = "26.0.0", path = "../arrow-array", default-features = false, optional = true }
Contributor (Author) commented:

The number of these dependencies is somewhat unfortunate; perhaps we should provide re-exports to reduce it. On the flip side, parquet is a very complex crate, so perhaps it is just a bit special in needing all the things 😅

Member replied:

Maybe for some basic crates like arrow-buffer, arrow-data, and arrow-schema, we could provide a re-export crate (arrow-core?) for them.

Like you said, if this is just a special case, then it is fine.
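A minimal sketch of what such a facade could look like; arrow-core is hypothetical here, and the crate name and module aliases are illustrative, not part of this PR:

```rust
// lib.rs of a hypothetical `arrow-core` facade crate: it would only
// re-export the basic subcrates, so consumers such as parquet could
// list one dependency instead of three.
pub use arrow_buffer as buffer; // Buffer, ArrowNativeType, ...
pub use arrow_data as data;     // ArrayData, ArrayDataBuilder, ...
pub use arrow_schema as schema; // DataType, Field, Schema, ...
```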

arrow-buffer = { version = "26.0.0", path = "../arrow-buffer", default-features = false, optional = true }
arrow-cast = { version = "26.0.0", path = "../arrow-cast", default-features = false, optional = true }
arrow-csv = { version = "26.0.0", path = "../arrow-csv", default-features = false, optional = true }
arrow-data = { version = "26.0.0", path = "../arrow-data", default-features = false, optional = true }
arrow-schema = { version = "26.0.0", path = "../arrow-schema", default-features = false, optional = true }
arrow-select = { version = "26.0.0", path = "../arrow-select", default-features = false, optional = true }
arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", default-features = false, optional = true }

ahash = { version = "0.8", default-features = false, features = ["compile-time-rng"] }
bytes = { version = "1.1", default-features = false, features = ["std"] }
thrift = { version = "0.16", default-features = false }
@@ -41,7 +50,6 @@ zstd = { version = "0.11.1", optional = true, default-features = false }
chrono = { version = "0.4", default-features = false, features = ["alloc"] }
num = { version = "0.4", default-features = false }
num-bigint = { version = "0.4", default-features = false }
arrow = { path = "../arrow", version = "26.0.0", optional = true, default-features = false, features = ["ipc"] }
base64 = { version = "0.13", default-features = false, features = ["std"], optional = true }
clap = { version = "4", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true }
serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true }
@@ -70,9 +78,9 @@ all-features = true
[features]
default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"]
# Enable arrow reader/writer APIs
arrow = ["dep:arrow", "base64"]
arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", "arrow-select", "arrow-ipc"]
Comment on lines 79 to +81

@viirya (Member) commented on Nov 10, 2022:

Seems these two lines can be combined?

Contributor (Author) replied:

What do you mean?

@viirya (Member) replied:

Never mind, I thought this arrow feature was unnecessary and we could just keep the "base64" feature here. But I saw you use arrow as a whole feature in many places.
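For context, a single crate-level feature like arrow typically gates whole modules, which is why it shows up as one flag in many places. A minimal sketch of the pattern, with an illustrative module body rather than parquet's actual source:

```rust
// One feature flag gates the entire Arrow integration; call sites then
// test the single `arrow` feature instead of each subcrate.
#[cfg(feature = "arrow")]
pub mod arrow {
    // arrow reader/writer APIs would live here
}

pub fn has_arrow_support() -> bool {
    cfg!(feature = "arrow")
}
```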

# Enable CLI tools
cli = ["json", "base64", "clap", "arrow/csv"]
cli = ["json", "base64", "clap", "arrow-csv"]
# Enable JSON APIs
json = ["serde_json", "base64"]
# Enable internal testing APIs
@@ -100,7 +108,7 @@ required-features = ["cli"]

[[bin]]
name = "parquet-fromcsv"
required-features = ["cli"]
required-features = ["arrow", "cli"]

[[bench]]
name = "arrow_writer"
2 changes: 1 addition & 1 deletion parquet/src/arrow/array_reader/builder.rs
@@ -17,7 +17,7 @@

use std::sync::Arc;

use arrow::datatypes::DataType;
use arrow_schema::DataType;

use crate::arrow::array_reader::empty_array::make_empty_array_reader;
use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
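The file-by-file changes below repeat this mechanical substitution. A minimal sketch of the before/after import mapping, using only types that appear in these hunks:

```rust
// Before this PR, everything was reached through the monolithic facade:
//   use arrow::datatypes::DataType;
//   use arrow::array::{Array, ArrayRef, BinaryArray};
//   use arrow::buffer::Buffer;
// After, the same types come from the focused subcrates:
use arrow_array::{Array, ArrayRef, BinaryArray};
use arrow_buffer::Buffer;
use arrow_schema::DataType;
use std::sync::Arc;

// The types behave identically; only the import paths change.
fn example() -> (ArrayRef, DataType, Buffer) {
    let array = BinaryArray::from(vec![b"a".as_ref(), b"bc".as_ref()]);
    let data_type = array.data_type().clone();
    let array_ref: ArrayRef = Arc::new(array);
    let buffer = Buffer::from(vec![0u8, 1, 2]);
    (array_ref, data_type, buffer)
}
```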
8 changes: 4 additions & 4 deletions parquet/src/arrow/array_reader/byte_array.rs
@@ -30,9 +30,9 @@ use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder};
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
use crate::util::memory::ByteBufferPtr;
use arrow::array::{Array, ArrayRef, BinaryArray, Decimal128Array, OffsetSizeTrait};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType as ArrowType;
use arrow_array::{Array, ArrayRef, BinaryArray, Decimal128Array, OffsetSizeTrait};
use arrow_buffer::Buffer;
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::ops::Range;
use std::sync::Arc;
@@ -587,7 +587,7 @@ mod tests {
use super::*;
use crate::arrow::array_reader::test_util::{byte_array_all_encodings, utf8_column};
use crate::arrow::record_reader::buffer::ValuesBuffer;
use arrow::array::{Array, StringArray};
use arrow_array::{Array, StringArray};

#[test]
fn test_byte_array_decoder() {
25 changes: 7 additions & 18 deletions parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -20,9 +20,9 @@ use std::marker::PhantomData;
use std::ops::Range;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
use arrow::buffer::Buffer;
use arrow::datatypes::{ArrowNativeType, DataType as ArrowType};
use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_schema::DataType as ArrowType;

use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain};
use crate::arrow::array_reader::{read_records, skip_records, ArrayReader};
@@ -188,15 +188,11 @@ where
}

fn get_def_levels(&self) -> Option<&[i16]> {
self.def_levels_buffer
.as_ref()
.map(|buf| buf.typed_data())
self.def_levels_buffer.as_ref().map(|buf| buf.typed_data())
}

fn get_rep_levels(&self) -> Option<&[i16]> {
self.rep_levels_buffer
.as_ref()
.map(|buf| buf.typed_data())
self.rep_levels_buffer.as_ref().map(|buf| buf.typed_data())
}
}

@@ -395,7 +391,7 @@ where

#[cfg(test)]
mod tests {
use arrow::array::{Array, StringArray};
use arrow_array::{Array, StringArray};
use arrow::compute::cast;

use crate::arrow::array_reader::test_util::{
@@ -528,13 +524,7 @@ mod tests {

assert_eq!(
strings.iter().collect::<Vec<_>>(),
vec![
Some("0"),
Some("1"),
Some("1"),
Some("2"),
Some("2"),
]
vec![Some("0"), Some("1"), Some("1"), Some("2"), Some("2"),]
)
}

@@ -625,7 +615,6 @@
}
}


#[test]
fn test_too_large_dictionary() {
let data: Vec<_> = (0..128)
5 changes: 3 additions & 2 deletions parquet/src/arrow/array_reader/empty_array.rs
@@ -17,8 +17,9 @@

use crate::arrow::array_reader::ArrayReader;
use crate::errors::Result;
use arrow::array::{ArrayDataBuilder, ArrayRef, StructArray};
use arrow::datatypes::DataType as ArrowType;
use arrow_schema::DataType as ArrowType;
use arrow_array::{ArrayRef, StructArray};
use arrow_data::ArrayDataBuilder;
use std::any::Any;
use std::sync::Arc;

15 changes: 8 additions & 7 deletions parquet/src/arrow/array_reader/fixed_len_byte_array.rs
@@ -27,12 +27,13 @@ use crate::column::reader::decoder::{ColumnValueDecoder, ValuesBufferSlice};
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
use crate::util::memory::ByteBufferPtr;
use arrow::array::{
ArrayDataBuilder, ArrayRef, Decimal128Array, FixedSizeBinaryArray,
IntervalDayTimeArray, IntervalYearMonthArray,
use arrow_array::{
ArrayRef, Decimal128Array, FixedSizeBinaryArray, IntervalDayTimeArray,
IntervalYearMonthArray,
};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType as ArrowType, IntervalUnit};
use arrow_buffer::Buffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType as ArrowType, IntervalUnit};
use std::any::Any;
use std::ops::Range;
use std::sync::Arc;
@@ -427,10 +428,10 @@ mod tests {
use super::*;
use crate::arrow::arrow_reader::ParquetRecordBatchReader;
use crate::arrow::ArrowWriter;
use arrow::array::{Array, Decimal128Array, ListArray};
use arrow_array::{Array, Decimal128Array, ListArray};
use arrow::datatypes::Field;
use arrow::error::Result as ArrowResult;
use arrow::record_batch::RecordBatch;
use arrow_array::RecordBatch;
use bytes::Bytes;
use std::sync::Arc;

16 changes: 9 additions & 7 deletions parquet/src/arrow/array_reader/list_array.rs
@@ -18,13 +18,14 @@
use crate::arrow::array_reader::ArrayReader;
use crate::errors::ParquetError;
use crate::errors::Result;
use arrow::array::{
new_empty_array, Array, ArrayData, ArrayRef, BooleanBufferBuilder, GenericListArray,
MutableArrayData, OffsetSizeTrait,
use arrow_array::{
builder::BooleanBufferBuilder, new_empty_array, Array, ArrayRef, GenericListArray,
OffsetSizeTrait,
};
use arrow::buffer::Buffer;
use arrow::datatypes::DataType as ArrowType;
use arrow::datatypes::ToByteSlice;
use arrow_buffer::Buffer;
use arrow_buffer::ToByteSlice;
use arrow_data::{transform::MutableArrayData, ArrayData};
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::cmp::Ordering;
use std::marker::PhantomData;
@@ -257,8 +258,9 @@ mod tests {
use crate::file::reader::{FileReader, SerializedFileReader};
use crate::schema::parser::parse_message_type;
use crate::schema::types::SchemaDescriptor;
use arrow::array::{Array, ArrayDataBuilder, PrimitiveArray};
use arrow::datatypes::{Field, Int32Type as ArrowInt32, Int32Type};
use arrow_array::{Array, PrimitiveArray};
use arrow_data::ArrayDataBuilder;
use std::sync::Arc;

fn list_type<OffsetSize: OffsetSizeTrait>(
16 changes: 8 additions & 8 deletions parquet/src/arrow/array_reader/map_array.rs
@@ -17,8 +17,8 @@

use crate::arrow::array_reader::{ArrayReader, ListArrayReader, StructArrayReader};
use crate::errors::Result;
use arrow::array::{Array, ArrayRef, MapArray};
use arrow::datatypes::DataType as ArrowType;
use arrow_array::{Array, ArrayRef, MapArray};
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::sync::Arc;

@@ -125,10 +125,10 @@ mod tests {
use super::*;
use crate::arrow::arrow_reader::ParquetRecordBatchReader;
use crate::arrow::ArrowWriter;
use arrow::array;
use arrow::array::{MapBuilder, PrimitiveBuilder, StringBuilder};
use arrow::datatypes::{Field, Int32Type, Schema};
use arrow::record_batch::RecordBatch;
use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder};
use arrow_array::cast::*;
use arrow_array::RecordBatch;
use bytes::Bytes;

#[test]
@@ -203,9 +203,9 @@
let col = record_batch.column(0);
assert!(col.is_null(0));
assert!(col.is_null(1));
let map_entry = array::as_map_array(col).value(2);
let struct_col = array::as_struct_array(&map_entry);
let key_col = array::as_string_array(struct_col.column(0)); // Key column
let map_entry = as_map_array(col).value(2);
let struct_col = as_struct_array(&map_entry);
let key_col = as_string_array(struct_col.column(0)); // Key column
assert_eq!(key_col.value(0), "three");
assert_eq!(key_col.value(1), "four");
assert_eq!(key_col.value(2), "five");
4 changes: 2 additions & 2 deletions parquet/src/arrow/array_reader/mod.rs
@@ -18,8 +18,8 @@
//! Logic for reading into arrow arrays

use crate::errors::Result;
use arrow::array::ArrayRef;
use arrow::datatypes::DataType as ArrowType;
use arrow_array::ArrayRef;
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::sync::Arc;

8 changes: 4 additions & 4 deletions parquet/src/arrow/array_reader/null_array.rs
@@ -22,9 +22,9 @@ use crate::column::page::PageIterator;
use crate::data_type::DataType;
use crate::errors::Result;
use crate::schema::types::ColumnDescPtr;
use arrow::array::ArrayRef;
use arrow::buffer::Buffer;
use arrow::datatypes::DataType as ArrowType;
use arrow_array::ArrayRef;
use arrow_buffer::Buffer;
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::sync::Arc;

@@ -82,7 +82,7 @@ where

fn consume_batch(&mut self) -> Result<ArrayRef> {
// convert to arrays
let array = arrow::array::NullArray::new(self.record_reader.num_values());
let array = arrow_array::NullArray::new(self.record_reader.num_values());

// save definition and repetition buffers
self.def_levels_buffer = self.record_reader.consume_def_levels();
21 changes: 11 additions & 10 deletions parquet/src/arrow/array_reader/primitive_array.rs
@@ -24,13 +24,14 @@ use crate::column::page::PageIterator;
use crate::data_type::{DataType, Int96};
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
use arrow::array::{
ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, Decimal128Array,
Float32Array, Float64Array, Int32Array, Int64Array, TimestampNanosecondArray,
TimestampNanosecondBufferBuilder, UInt32Array, UInt64Array,
use arrow_array::{
builder::{BooleanBufferBuilder, TimestampNanosecondBufferBuilder},
ArrayRef, BooleanArray, Decimal128Array, Float32Array, Float64Array, Int32Array,
Int64Array, TimestampNanosecondArray, UInt32Array, UInt64Array,
};
use arrow::buffer::Buffer;
use arrow::datatypes::{DataType as ArrowType, TimeUnit};
use arrow_buffer::Buffer;
use arrow_data::ArrayDataBuilder;
use arrow_schema::{DataType as ArrowType, TimeUnit};
use std::any::Any;
use std::sync::Arc;

@@ -205,8 +206,8 @@ where
let array = match target_type {
ArrowType::Date64 => {
// this is cheap as it internally reinterprets the data
let a = arrow::compute::cast(&array, &ArrowType::Date32)?;
arrow::compute::cast(&a, target_type)?
let a = arrow_cast::cast(&array, &ArrowType::Date32)?;
arrow_cast::cast(&a, target_type)?
}
ArrowType::Decimal128(p, s) => {
let array = match array.data_type() {
@@ -236,7 +237,7 @@

Arc::new(array) as ArrayRef
}
_ => arrow::compute::cast(&array, target_type)?,
_ => arrow_cast::cast(&array, target_type)?,
};

// save definition and repetition buffers
@@ -270,8 +271,8 @@ mod tests {
use crate::schema::types::SchemaDescriptor;
use crate::util::test_common::rand_gen::make_pages;
use crate::util::InMemoryPageIterator;
use arrow::array::{Array, PrimitiveArray};
use arrow::datatypes::ArrowPrimitiveType;
use arrow_array::{Array, PrimitiveArray};

use arrow::datatypes::DataType::Decimal128;
use rand::distributions::uniform::SampleUniform;
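These hunks swap arrow::compute::cast for arrow_cast::cast, the same kernel reached without the facade. A minimal sketch of the call, assuming arrow-array, arrow-cast, and arrow-schema are on the dependency list:

```rust
use arrow_array::{Array, ArrayRef, Int32Array};
use arrow_cast::cast;
use arrow_schema::DataType;
use std::sync::Arc;

fn main() {
    // Days since the Unix epoch, stored as plain Int32.
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![18993, 18994, 18995]));
    // Reinterpret as Date32, the cheap first step the Date64 branch above
    // takes before casting on to the target type.
    let dates = cast(&ints, &DataType::Date32).expect("Int32 -> Date32 cast");
    assert_eq!(dates.data_type(), &DataType::Date32);
}
```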
9 changes: 4 additions & 5 deletions parquet/src/arrow/array_reader/struct_array.rs
@@ -17,10 +17,9 @@

use crate::arrow::array_reader::ArrayReader;
use crate::errors::{ParquetError, Result};
use arrow::array::{
ArrayData, ArrayDataBuilder, ArrayRef, BooleanBufferBuilder, StructArray,
};
use arrow::datatypes::DataType as ArrowType;
use arrow_array::{builder::BooleanBufferBuilder, ArrayRef, StructArray};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::sync::Arc;

@@ -216,9 +215,9 @@ mod tests {
use super::*;
use crate::arrow::array_reader::test_util::InMemoryArrayReader;
use crate::arrow::array_reader::ListArrayReader;
use arrow::array::{Array, Int32Array, ListArray};
use arrow::buffer::Buffer;
use arrow::datatypes::Field;
use arrow_array::{Array, Int32Array, ListArray};

#[test]
fn test_struct_array_reader() {
4 changes: 2 additions & 2 deletions parquet/src/arrow/array_reader/test_util.rs
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Array, ArrayRef};
use arrow::datatypes::DataType as ArrowType;
use arrow_array::{Array, ArrayRef};
use arrow_schema::DataType as ArrowType;
use std::any::Any;
use std::sync::Arc;

Expand Down