New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve speed of writing string dictionaries to parquet by skipping a copy (#1764) #2322
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -143,6 +143,17 @@ pub fn create_random_array( | |
}) | ||
.collect::<Result<Vec<(&str, ArrayRef)>>>()?, | ||
)?), | ||
d @ Dictionary(_, value_type) | ||
if crate::compute::can_cast_types(value_type, d) => | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. using |
||
{ | ||
let f = Field::new( | ||
field.name(), | ||
value_type.as_ref().clone(), | ||
field.is_nullable(), | ||
); | ||
let v = create_random_array(&f, size, null_density, true_density)?; | ||
crate::compute::cast(&v, d)? | ||
} | ||
other => { | ||
return Err(ArrowError::NotYetImplemented(format!( | ||
"Generating random arrays not yet implemented for {:?}", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,6 @@ | |
// under the License. | ||
|
||
use crate::arrow::arrow_writer::levels::LevelInfo; | ||
use crate::arrow::arrow_writer::ArrayWriter; | ||
use crate::basic::Encoding; | ||
use crate::column::page::PageWriter; | ||
use crate::column::writer::encoder::{ | ||
|
@@ -33,11 +32,38 @@ use crate::schema::types::ColumnDescPtr; | |
use crate::util::bit_util::num_required_bits; | ||
use crate::util::interner::{Interner, Storage}; | ||
use arrow::array::{ | ||
Array, ArrayAccessor, ArrayRef, BinaryArray, LargeBinaryArray, LargeStringArray, | ||
StringArray, | ||
Array, ArrayAccessor, ArrayRef, BinaryArray, DictionaryArray, LargeBinaryArray, | ||
LargeStringArray, StringArray, | ||
}; | ||
use arrow::datatypes::DataType; | ||
|
||
macro_rules! downcast_dict_impl { | ||
($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{ | ||
$op($array | ||
.as_any() | ||
.downcast_ref::<DictionaryArray<arrow::datatypes::$key>>() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When you say "could be made faster" the issue is that this is effectively creating something that will iterate over strings, which means that to encode the column, the arrow array dictionary index is used to find a string, which is then used to find the parquet array index which is then written. It could potentially be faster if we skipped the string step in the middle and simply computed an arrow dictionary index --> parquet dictionary index mapping up front and applied that mapping during writing (I think you said this in this PR's description, but I am restating it to confirm I understand what is happening) |
||
.unwrap() | ||
.downcast_dict::<$val>() | ||
.unwrap()$(, $arg)*) | ||
}}; | ||
} | ||
|
||
macro_rules! downcast_dict_op { | ||
($key_type:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => { | ||
match $key_type.as_ref() { | ||
DataType::UInt8 => downcast_dict_impl!($array, UInt8Type, $val, $op$(, $arg)*), | ||
DataType::UInt16 => downcast_dict_impl!($array, UInt16Type, $val, $op$(, $arg)*), | ||
DataType::UInt32 => downcast_dict_impl!($array, UInt32Type, $val, $op$(, $arg)*), | ||
DataType::UInt64 => downcast_dict_impl!($array, UInt64Type, $val, $op$(, $arg)*), | ||
DataType::Int8 => downcast_dict_impl!($array, Int8Type, $val, $op$(, $arg)*), | ||
DataType::Int16 => downcast_dict_impl!($array, Int16Type, $val, $op$(, $arg)*), | ||
DataType::Int32 => downcast_dict_impl!($array, Int32Type, $val, $op$(, $arg)*), | ||
DataType::Int64 => downcast_dict_impl!($array, Int64Type, $val, $op$(, $arg)*), | ||
_ => unreachable!(), | ||
} | ||
}; | ||
} | ||
|
||
macro_rules! downcast_op { | ||
($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => { | ||
match $data_type { | ||
|
@@ -51,36 +77,44 @@ macro_rules! downcast_op { | |
DataType::LargeBinary => { | ||
$op($array.as_any().downcast_ref::<LargeBinaryArray>().unwrap()$(, $arg)*) | ||
} | ||
d => unreachable!("cannot downcast {} to byte array", d) | ||
DataType::Dictionary(key, value) => match value.as_ref() { | ||
DataType::Utf8 => downcast_dict_op!(key, StringArray, $array, $op$(, $arg)*), | ||
DataType::LargeUtf8 => { | ||
downcast_dict_op!(key, LargeStringArray, $array, $op$(, $arg)*) | ||
} | ||
DataType::Binary => downcast_dict_op!(key, BinaryArray, $array, $op$(, $arg)*), | ||
DataType::LargeBinary => { | ||
downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*) | ||
} | ||
d => unreachable!("cannot downcast {} dictionary value to byte array", d), | ||
}, | ||
d => unreachable!("cannot downcast {} to byte array", d), | ||
} | ||
}; | ||
} | ||
|
||
/// Returns an [`ArrayWriter`] for byte or string arrays | ||
pub(super) fn make_byte_array_writer<'a>( | ||
descr: ColumnDescPtr, | ||
data_type: DataType, | ||
props: WriterPropertiesPtr, | ||
page_writer: Box<dyn PageWriter + 'a>, | ||
on_close: OnCloseColumnChunk<'a>, | ||
) -> Box<dyn ArrayWriter + 'a> { | ||
Box::new(ByteArrayWriter { | ||
writer: Some(GenericColumnWriter::new(descr, props, page_writer)), | ||
on_close: Some(on_close), | ||
data_type, | ||
}) | ||
} | ||
|
||
/// An [`ArrayWriter`] for [`ByteArray`] | ||
struct ByteArrayWriter<'a> { | ||
writer: Option<GenericColumnWriter<'a, ByteArrayEncoder>>, | ||
/// A writer for byte array types | ||
pub(super) struct ByteArrayWriter<'a> { | ||
writer: GenericColumnWriter<'a, ByteArrayEncoder>, | ||
on_close: Option<OnCloseColumnChunk<'a>>, | ||
data_type: DataType, | ||
} | ||
|
||
impl<'a> ArrayWriter for ByteArrayWriter<'a> { | ||
fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { | ||
self.writer.as_mut().unwrap().write_batch_internal( | ||
impl<'a> ByteArrayWriter<'a> { | ||
/// Returns a new [`ByteArrayWriter`] | ||
pub fn new( | ||
descr: ColumnDescPtr, | ||
props: &'a WriterPropertiesPtr, | ||
page_writer: Box<dyn PageWriter + 'a>, | ||
on_close: OnCloseColumnChunk<'a>, | ||
) -> Result<Self> { | ||
Ok(Self { | ||
writer: GenericColumnWriter::new(descr, props.clone(), page_writer), | ||
on_close: Some(on_close), | ||
}) | ||
} | ||
|
||
pub fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { | ||
self.writer.write_batch_internal( | ||
array, | ||
Some(levels.non_null_indices()), | ||
levels.def_levels(), | ||
|
@@ -92,11 +126,11 @@ impl<'a> ArrayWriter for ByteArrayWriter<'a> { | |
Ok(()) | ||
} | ||
|
||
fn close(&mut self) -> Result<()> { | ||
pub fn close(self) -> Result<()> { | ||
let (bytes_written, rows_written, metadata, column_index, offset_index) = | ||
self.writer.take().unwrap().close()?; | ||
self.writer.close()?; | ||
|
||
if let Some(on_close) = self.on_close.take() { | ||
if let Some(on_close) = self.on_close { | ||
on_close( | ||
bytes_written, | ||
rows_written, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,70 +33,18 @@ use super::schema::{ | |
decimal_length_from_precision, | ||
}; | ||
|
||
use crate::column::writer::{get_column_writer, ColumnWriter, ColumnWriterImpl}; | ||
use crate::arrow::arrow_writer::byte_array::ByteArrayWriter; | ||
use crate::column::writer::{ColumnWriter, ColumnWriterImpl}; | ||
use crate::errors::{ParquetError, Result}; | ||
use crate::file::metadata::RowGroupMetaDataPtr; | ||
use crate::file::properties::WriterProperties; | ||
use crate::file::writer::{SerializedColumnWriter, SerializedRowGroupWriter}; | ||
use crate::file::writer::SerializedRowGroupWriter; | ||
use crate::{data_type::*, file::writer::SerializedFileWriter}; | ||
use levels::{calculate_array_levels, LevelInfo}; | ||
|
||
mod byte_array; | ||
mod levels; | ||
|
||
/// An object-safe API for writing an [`ArrayRef`] | ||
trait ArrayWriter { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I ended up implementing type erasure within the ByteArrayWriter, and so this indirection can be removed |
||
fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()>; | ||
|
||
fn close(&mut self) -> Result<()>; | ||
} | ||
|
||
/// Fallback implementation for writing an [`ArrayRef`] that uses [`SerializedColumnWriter`] | ||
struct ColumnArrayWriter<'a>(Option<SerializedColumnWriter<'a>>); | ||
|
||
impl<'a> ArrayWriter for ColumnArrayWriter<'a> { | ||
fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> { | ||
write_leaf(self.0.as_mut().unwrap().untyped(), array, levels)?; | ||
Ok(()) | ||
} | ||
|
||
fn close(&mut self) -> Result<()> { | ||
self.0.take().unwrap().close() | ||
} | ||
} | ||
|
||
fn get_writer<'a, W: Write>( | ||
row_group_writer: &'a mut SerializedRowGroupWriter<'_, W>, | ||
data_type: &ArrowDataType, | ||
) -> Result<Box<dyn ArrayWriter + 'a>> { | ||
let array_writer = row_group_writer | ||
.next_column_with_factory( | ||
|descr, props, page_writer, on_close| match data_type { | ||
ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 | ||
| ArrowDataType::Binary | ||
| ArrowDataType::LargeBinary => Ok(byte_array::make_byte_array_writer( | ||
descr, | ||
data_type.clone(), | ||
props.clone(), | ||
page_writer, | ||
on_close, | ||
)), | ||
_ => { | ||
let column_writer = | ||
get_column_writer(descr, props.clone(), page_writer); | ||
|
||
let serialized_writer = | ||
SerializedColumnWriter::new(column_writer, Some(on_close)); | ||
|
||
Ok(Box::new(ColumnArrayWriter(Some(serialized_writer)))) | ||
} | ||
}, | ||
)? | ||
.expect("Unable to get column writer"); | ||
Ok(array_writer) | ||
} | ||
|
||
/// Arrow writer | ||
/// | ||
/// Writes Arrow `RecordBatch`es to a Parquet writer, buffering up `RecordBatch` in order | ||
|
@@ -314,22 +262,24 @@ fn write_leaves<W: Write>( | |
| ArrowDataType::Time64(_) | ||
| ArrowDataType::Duration(_) | ||
| ArrowDataType::Interval(_) | ||
| ArrowDataType::LargeBinary | ||
| ArrowDataType::Binary | ||
| ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 | ||
| ArrowDataType::Decimal128(_, _) | ||
| ArrowDataType::Decimal256(_, _) | ||
| ArrowDataType::FixedSizeBinary(_) => { | ||
let mut writer = get_writer(row_group_writer, &data_type)?; | ||
let mut col_writer = row_group_writer.next_column()?.unwrap(); | ||
for (array, levels) in arrays.iter().zip(levels.iter_mut()) { | ||
writer.write( | ||
array, | ||
levels.pop().expect("Levels exhausted"), | ||
)?; | ||
write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; | ||
} | ||
writer.close()?; | ||
Ok(()) | ||
col_writer.close() | ||
} | ||
ArrowDataType::LargeBinary | ||
| ArrowDataType::Binary | ||
| ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 => { | ||
let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); | ||
for (array, levels) in arrays.iter().zip(levels.iter_mut()) { | ||
col_writer.write(array, levels.pop().expect("Levels exhausted"))?; | ||
} | ||
col_writer.close() | ||
} | ||
ArrowDataType::List(_) | ArrowDataType::LargeList(_) => { | ||
let arrays: Vec<_> = arrays.iter().map(|array|{ | ||
|
@@ -380,18 +330,21 @@ fn write_leaves<W: Write>( | |
write_leaves(row_group_writer, &values, levels)?; | ||
Ok(()) | ||
} | ||
ArrowDataType::Dictionary(_, value_type) => { | ||
let mut writer = get_writer(row_group_writer, value_type)?; | ||
for (array, levels) in arrays.iter().zip(levels.iter_mut()) { | ||
// cast dictionary to a primitive | ||
let array = arrow::compute::cast(array, value_type)?; | ||
writer.write( | ||
&array, | ||
levels.pop().expect("Levels exhausted"), | ||
)?; | ||
ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() { | ||
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => { | ||
let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap(); | ||
for (array, levels) in arrays.iter().zip(levels.iter_mut()) { | ||
col_writer.write(array, levels.pop().expect("Levels exhausted"))?; | ||
} | ||
col_writer.close() | ||
} | ||
_ => { | ||
let mut col_writer = row_group_writer.next_column()?.unwrap(); | ||
for (array, levels) in arrays.iter().zip(levels.iter_mut()) { | ||
write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?; | ||
} | ||
col_writer.close() | ||
} | ||
writer.close()?; | ||
Ok(()) | ||
} | ||
ArrowDataType::Float16 => Err(ParquetError::ArrowError( | ||
"Float16 arrays not supported".to_string(), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is strange that having a reference to `&V` would require `V: Clone` in order to `#[derive(Clone)]` 🤷

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/mzabaluev/rust-rfcs/blob/generic-derive/text/0000-generic-derive.md
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That RFC doesn't sound quite right (or at least it is overkill -- like a 🔨 for swatting a 🪰). All that is needed in this case is to recognize that the struct only uses `&V` rather than `V`, rather than a way to provide generic arguments to macros 😱

Also, I am firmly of the belief that adding more generics is not the answer to most of life's problems 🤣 Maybe because my feeble mind can't handle the extra level of indirection