Skip to content

Commit

Permalink
feat: encode FixedSizeBinary in JSON as hex string (#5622)
Browse files Browse the repository at this point in the history
* feat: encode FixedSizeBinary in JSON as hex

Adds encoding support to the JSON writer for the FixedSizeBinary DataType

A test was added as well

* fix: properly encode fixed size binary as string

The fixed size binary values were not being encoded with surrounding
double quotes. This fixes that, and updates the added test to actually
parse the written JSON as JSON, using serde_json, and make assertions
against that.

* chore: remove unused hex dep in arrow-json

* refactor: check for null serialization of fixedsizebinary in JSON

* refactor: extend explicit nulls to the FixedSizeBinaryEncoder

Have the FixedSizeBinaryEncoder for the JSON writer handle explicit null
values, based on the Writer's EncoderOptions.

* refactor: borrow array in JSON fixed size binary encoder

Changed the FixedSizeBinaryEncoder for the JSON writer to use a borrow
of the FixedSizeBinaryArray being encoded, to follow other Encoder
implementations, and to remove the use of clone.

* refactor: remove need for clone in JSON encoder types

BooleanEncoder and StringEncoder were changed to use borrows of their
respective Array types, to avoid cloning.

* refactor: remove null handling in JSON FixedSizeBinaryEncoder

The FixedSizeBinaryEncoder does not need to handle nulls itself, as nulls
are handled by the parent encoder, i.e., list/map.
  • Loading branch information
hiltontj committed Apr 12, 2024
1 parent 0a1dfb3 commit 6450527
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 8 deletions.
80 changes: 79 additions & 1 deletion arrow-json/src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,9 @@ mod tests {

use serde_json::json;

use arrow_array::builder::{Int32Builder, Int64Builder, MapBuilder, StringBuilder};
use arrow_array::builder::{
FixedSizeBinaryBuilder, Int32Builder, Int64Builder, MapBuilder, StringBuilder,
};
use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer, ToByteSlice};
use arrow_data::ArrayData;

Expand Down Expand Up @@ -2137,4 +2139,80 @@ mod tests {

Ok(())
}

/// Writes a nullable `FixedSizeBinary(11)` column through the JSON array
/// writer and checks that each value comes out as a lowercase hex string,
/// both with explicit nulls enabled and with the default writer.
#[test]
fn test_writer_fixed_size_binary() {
    // Both payloads below are exactly 11 bytes long.
    let byte_width = 11;

    // Schema: one nullable column called "bytes".
    let field = Field::new("bytes", DataType::FixedSizeBinary(byte_width), true);
    let schema = SchemaRef::new(Schema::new(vec![field]));

    // Column contents: value, null, value.
    let mut builder = FixedSizeBinaryBuilder::new(byte_width);
    builder.append_value(b"hello world").unwrap();
    builder.append_null();
    builder.append_value(b"summer rain").unwrap();
    let column: ArrayRef = Arc::new(builder.finish());
    let batch = RecordBatch::try_new(schema, vec![column]).unwrap();

    // Expected hex renderings of the two non-null rows.
    let hello_hex = "68656c6c6f20776f726c64";
    let rain_hex = "73756d6d6572207261696e";

    // Scenario 1: explicit nulls requested, so the null row keeps its key.
    let mut explicit_buf = Vec::new();
    let mut explicit_writer = WriterBuilder::new()
        .with_explicit_nulls(true)
        .build::<_, JsonArray>(&mut explicit_buf);
    explicit_writer.write(&batch).unwrap();
    explicit_writer.close().unwrap();
    let with_nulls: Value = serde_json::from_slice(&explicit_buf).unwrap();
    assert_eq!(
        json!([
            { "bytes": hello_hex },
            { "bytes": null }, // the explicit null
            { "bytes": rain_hex }
        ]),
        with_nulls,
    );

    // Scenario 2: a plain `ArrayWriter`; explicit nulls are off by default,
    // so the null row is written as an empty object.
    let mut default_buf = Vec::new();
    let mut default_writer = ArrayWriter::new(&mut default_buf);
    default_writer.write(&batch).unwrap();
    default_writer.close().unwrap();
    let without_nulls: Value = serde_json::from_slice(&default_buf).unwrap();
    assert_eq!(
        json!([
            { "bytes": hello_hex },
            {}, // empty because nulls are omitted
            { "bytes": rain_hex }
        ]),
        without_nulls,
    );
}
}
38 changes: 31 additions & 7 deletions arrow-json/src/writer/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,16 @@ fn make_encoder_impl<'a>(
DataType::Float64 => primitive_helper!(Float64Type),
DataType::Boolean => {
let array = array.as_boolean();
(Box::new(BooleanEncoder(array.clone())), array.nulls().cloned())
(Box::new(BooleanEncoder(array)), array.nulls().cloned())
}
DataType::Null => (Box::new(NullEncoder), array.logical_nulls()),
DataType::Utf8 => {
let array = array.as_string::<i32>();
(Box::new(StringEncoder(array.clone())) as _, array.nulls().cloned())
(Box::new(StringEncoder(array)) as _, array.nulls().cloned())
}
DataType::LargeUtf8 => {
let array = array.as_string::<i64>();
(Box::new(StringEncoder(array.clone())) as _, array.nulls().cloned())
(Box::new(StringEncoder(array)) as _, array.nulls().cloned())
}
DataType::List(_) => {
let array = array.as_list::<i32>();
Expand All @@ -99,6 +99,11 @@ fn make_encoder_impl<'a>(
(Box::new(MapEncoder::try_new(array, options)?) as _, array.nulls().cloned())
}

DataType::FixedSizeBinary(_) => {
let array = array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
(Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
}

DataType::Struct(fields) => {
let array = array.as_struct();
let encoders = fields.iter().zip(array.columns()).map(|(field, array)| {
Expand Down Expand Up @@ -259,9 +264,9 @@ impl<N: PrimitiveEncode> Encoder for PrimitiveEncoder<N> {
}
}

struct BooleanEncoder(BooleanArray);
struct BooleanEncoder<'a>(&'a BooleanArray);

impl Encoder for BooleanEncoder {
impl<'a> Encoder for BooleanEncoder<'a> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
match self.0.value(idx) {
true => out.extend_from_slice(b"true"),
Expand All @@ -270,9 +275,9 @@ impl Encoder for BooleanEncoder {
}
}

struct StringEncoder<O: OffsetSizeTrait>(GenericStringArray<O>);
struct StringEncoder<'a, O: OffsetSizeTrait>(&'a GenericStringArray<O>);

impl<O: OffsetSizeTrait> Encoder for StringEncoder<O> {
impl<'a, O: OffsetSizeTrait> Encoder for StringEncoder<'a, O> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
encode_string(self.0.value(idx), out);
}
Expand Down Expand Up @@ -443,3 +448,22 @@ impl<'a> Encoder for MapEncoder<'a> {
out.push(b'}');
}
}

struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray);

impl<'a> FixedSizeBinaryEncoder<'a> {
fn new(array: &'a FixedSizeBinaryArray) -> Self {
Self(array)
}
}

impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
out.push(b'"');
for byte in self.0.value(idx) {
// this write is infallible
write!(out, "{byte:02x}").unwrap();
}
out.push(b'"');
}
}

0 comments on commit 6450527

Please sign in to comment.