From 8d75101e3773e5d74c8d5cda356a9eaba34acf90 Mon Sep 17 00:00:00 2001
From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Date: Wed, 9 Nov 2022 18:10:24 +1300
Subject: [PATCH] Split out arrow-json (#3044) (#3049)

* Split out arrow-json (#3044)

* RAT

* Fix feature

* Revert no_run

* RAT
---
 .github/workflows/arrow.yml | 5 +
 .github/workflows/arrow_flight.yml | 1 +
 .github/workflows/dev_pr/labeler.yml | 1 +
 .github/workflows/integration.yml | 1 +
 .github/workflows/miri.yaml | 1 +
 .github/workflows/parquet.yml | 1 +
 Cargo.toml | 3 +-
 arrow-json/Cargo.toml | 54 +++++++
 .../src/json/mod.rs => arrow-json/src/lib.rs | 0
 {arrow/src/json => arrow-json/src}/reader.rs | 136 +++++++++---------
 {arrow/src/json => arrow-json/src}/writer.rs | 98 +++++++------
 {arrow => arrow-json}/test/data/arrays.json | 0
 {arrow => arrow-json}/test/data/basic.json | 0
 .../test/data/basic_nulls.json | 0
 .../test/data/list_string_dict_nested.json | 0
 .../data/list_string_dict_nested_nulls.json | 0
 .../test/data/mixed_arrays.json | 0
 .../test/data/mixed_arrays.json.gz | Bin
 .../test/data/nested_structs.json | 0
 arrow/Cargo.toml | 18 ++-
 arrow/benches/json_reader.rs | 5 +-
 arrow/src/lib.rs | 6 +-
 dev/release/rat_exclude_files.txt | 1 +
 23 files changed, 194 insertions(+), 137 deletions(-)
 create mode 100644 arrow-json/Cargo.toml
 rename arrow/src/json/mod.rs => arrow-json/src/lib.rs (100%)
 rename {arrow/src/json => arrow-json/src}/reader.rs (97%)
 rename {arrow/src/json => arrow-json/src}/writer.rs (95%)
 rename {arrow => arrow-json}/test/data/arrays.json (100%)
 rename {arrow => arrow-json}/test/data/basic.json (100%)
 rename {arrow => arrow-json}/test/data/basic_nulls.json (100%)
 rename {arrow => arrow-json}/test/data/list_string_dict_nested.json (100%)
 rename {arrow => arrow-json}/test/data/list_string_dict_nested_nulls.json (100%)
 rename {arrow => arrow-json}/test/data/mixed_arrays.json (100%)
 rename {arrow => arrow-json}/test/data/mixed_arrays.json.gz (100%)
 rename {arrow => arrow-json}/test/data/nested_structs.json (100%)

diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index 461e7e87ea5..2e1c64ebe3a 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -35,6 +35,7 @@ on:
       - arrow-integration-test/**
       - arrow-ipc/**
       - arrow-csv/**
+      - arrow-json/**
       - .github/**

 jobs:
@@ -67,6 +68,8 @@
         run: cargo test -p arrow-ipc --all-features
       - name: Test arrow-csv with all features
         run: cargo test -p arrow-csv --all-features
+      - name: Test arrow-json with all features
+        run: cargo test -p arrow-json --all-features
       - name: Test arrow-integration-test with all features
         run: cargo test -p arrow-integration-test --all-features
       - name: Test arrow with default features
@@ -179,5 +182,7 @@
         run: cargo clippy -p arrow-ipc --all-targets --all-features -- -D warnings
       - name: Clippy arrow-csv with all features
         run: cargo clippy -p arrow-csv --all-targets --all-features -- -D warnings
+      - name: Clippy arrow-json with all features
+        run: cargo clippy -p arrow-json --all-targets --all-features -- -D warnings
       - name: Clippy arrow
         run: cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression,dyn_cmp_dict,dyn_arith_dict,chrono-tz --all-targets -- -D warnings
diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml
index 1f830ccf2b2..2825d2400f1 100644
--- a/.github/workflows/arrow_flight.yml
+++ b/.github/workflows/arrow_flight.yml
@@ -37,6 +37,7 @@ on:
       - arrow-flight/**
       - arrow-ipc/**
       - arrow-csv/**
+      - arrow-json/**
      - .github/**

 jobs:
diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml
index 04c7c080e01..d93932cd233 100644
--- a/.github/workflows/dev_pr/labeler.yml
+++ b/.github/workflows/dev_pr/labeler.yml
@@ -25,6 +25,7 @@ arrow:
   - arrow-select/**/*
   - arrow-ipc/**/*
   - arrow-csv/**/*
+  - arrow-json/**/*

 arrow-flight:
   - arrow-flight/**/*
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 9418b904283..3ece06b2923 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -33,6 +33,7 @@ on:
      - arrow-select/**
      - arrow-ipc/**
      - arrow-csv/**
+     - arrow-json/**
      - arrow-pyarrow-integration-testing/**
      - arrow-integration-test/**
      - arrow-integration-testing/**
diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml
index e58ebdb3569..b1f5d85fc58 100644
--- a/.github/workflows/miri.yaml
+++ b/.github/workflows/miri.yaml
@@ -33,6 +33,7 @@ on:
      - arrow-select/**
      - arrow-ipc/**
      - arrow-csv/**
+     - arrow-json/**
      - .github/**

 jobs:
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 4f3cf5f8005..5b0cc87440e 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -36,6 +36,7 @@ on:
      - arrow-select/**
      - arrow-ipc/**
      - arrow-csv/**
+     - arrow-json/**
      - parquet/**
      - .github/**

diff --git a/Cargo.toml b/Cargo.toml
index 18497d04379..16b4cb7f89e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,12 +27,13 @@ members = [
     "arrow-integration-test",
     "arrow-integration-testing",
     "arrow-ipc",
+    "arrow-json",
     "arrow-schema",
     "arrow-select",
+    "object_store",
     "parquet",
     "parquet_derive",
     "parquet_derive_test",
-    "object_store",
 ]
 # Enable the version 2 feature resolver, which avoids unifying features for targets that are not being built
 #
diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml
new file mode 100644
index 00000000000..0d8c9109210
--- /dev/null
+++ b/arrow-json/Cargo.toml
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +[package] +name = "arrow-json" +version = "26.0.0" +description = "Support for parsing JSON format into the Arrow format" +homepage = "https://github.com/apache/arrow-rs" +repository = "https://github.com/apache/arrow-rs" +authors = ["Apache Arrow "] +license = "Apache-2.0" +keywords = ["arrow"] +include = [ + "benches/*.rs", + "src/**/*.rs", + "Cargo.toml", +] +edition = "2021" +rust-version = "1.62" + +[lib] +name = "arrow_json" +path = "src/lib.rs" +bench = false + +[dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } +arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } +arrow-cast = { version = "26.0.0", path = "../arrow-cast" } +arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-schema = { version = "26.0.0", path = "../arrow-schema" } +half = { version = "2.1", default-features = false } +indexmap = { version = "1.9", default-features = false, features = ["std"] } +num = { version = "0.4", default-features = false, features = ["std"] } +serde_json = { version = "1.0", default-features = false, features = ["std"] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } + +[dev-dependencies] +tempfile = "3.3" +flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow/src/json/mod.rs b/arrow-json/src/lib.rs similarity index 100% rename from arrow/src/json/mod.rs rename to arrow-json/src/lib.rs diff --git a/arrow/src/json/reader.rs b/arrow-json/src/reader.rs similarity index 97% rename from arrow/src/json/reader.rs rename to arrow-json/src/reader.rs index 78c51559a7d..b3af909ef46 100644 --- a/arrow/src/json/reader.rs +++ b/arrow-json/src/reader.rs @@ -24,11 +24,10 @@ //! Example: //! //! ``` -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use std::fs::File; -//! use std::io::BufReader; -//! use std::sync::Arc; +//! # use arrow_schema::*; +//! # use std::fs::File; +//! # use std::io::BufReader; +//! # use std::sync::Arc; //! //! let schema = Schema::new(vec![ //! Field::new("a", DataType::Float64, false), @@ -38,10 +37,10 @@ //! //! let file = File::open("test/data/basic.json").unwrap(); //! -//! let mut json = json::Reader::new( +//! let mut json = arrow_json::Reader::new( //! BufReader::new(file), //! Arc::new(schema), -//! json::reader::DecoderOptions::new(), +//! arrow_json::reader::DecoderOptions::new(), //! ); //! //! 
let batch = json.next().unwrap().unwrap(); @@ -55,13 +54,13 @@ use indexmap::set::IndexSet as HashSet; use serde_json::json; use serde_json::{map::Map as JsonMap, Value}; -use crate::buffer::MutableBuffer; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::record_batch::{RecordBatch, RecordBatchOptions}; -use crate::util::bit_util; -use crate::{array::*, buffer::Buffer}; +use arrow_array::builder::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_buffer::{bit_util, Buffer, MutableBuffer}; use arrow_cast::parse::Parser; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::*; #[derive(Debug, Clone)] enum InferredType { @@ -72,7 +71,7 @@ enum InferredType { } impl InferredType { - fn merge(&mut self, other: InferredType) -> Result<()> { + fn merge(&mut self, other: InferredType) -> Result<(), ArrowError> { match (self, other) { (InferredType::Array(s), InferredType::Array(o)) => { s.merge(*o)?; @@ -147,7 +146,7 @@ fn coerce_data_type(dt: Vec<&DataType>) -> DataType { }) } -fn generate_datatype(t: &InferredType) -> Result { +fn generate_datatype(t: &InferredType) -> Result { Ok(match t { InferredType::Scalar(hs) => coerce_data_type(hs.iter().collect()), InferredType::Object(spec) => DataType::Struct(generate_fields(spec)?), @@ -160,14 +159,16 @@ fn generate_datatype(t: &InferredType) -> Result { }) } -fn generate_fields(spec: &HashMap) -> Result> { +fn generate_fields( + spec: &HashMap, +) -> Result, ArrowError> { spec.iter() .map(|(k, types)| Ok(Field::new(k, generate_datatype(types)?, true))) .collect() } /// Generate schema from JSON field names and inferred data types -fn generate_schema(spec: HashMap) -> Result { +fn generate_schema(spec: HashMap) -> Result { Ok(Schema::new(generate_fields(&spec)?)) } @@ -178,7 +179,7 @@ fn generate_schema(spec: HashMap) -> Result { /// ``` /// use std::fs::File; /// use std::io::BufReader; -/// use arrow::json::reader::ValueIter; +/// use arrow_json::reader::ValueIter; /// /// let mut reader = /// BufReader::new(File::open("test/data/mixed_arrays.json").unwrap()); @@ -208,7 +209,7 @@ impl<'a, R: Read> ValueIter<'a, R> { } impl<'a, R: Read> Iterator for ValueIter<'a, R> { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { if let Some(max) = self.max_read_records { @@ -259,7 +260,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { /// ``` /// use std::fs::File; /// use std::io::BufReader; -/// use arrow::json::reader::infer_json_schema_from_seekable; +/// use arrow_json::reader::infer_json_schema_from_seekable; /// /// let file = File::open("test/data/mixed_arrays.json").unwrap(); /// // file's cursor's offset at 0 @@ -270,7 +271,7 @@ impl<'a, R: Read> Iterator for ValueIter<'a, R> { pub fn infer_json_schema_from_seekable( reader: &mut BufReader, max_read_records: Option, -) -> Result { +) -> Result { let schema = infer_json_schema(reader, max_read_records); // return the reader seek back to the start reader.seek(SeekFrom::Start(0))?; @@ -292,7 +293,7 @@ pub fn infer_json_schema_from_seekable( /// use std::fs::File; /// use std::io::{BufReader, SeekFrom, Seek}; /// use flate2::read::GzDecoder; -/// use arrow::json::reader::infer_json_schema; +/// use arrow_json::reader::infer_json_schema; /// /// let mut file = File::open("test/data/mixed_arrays.json.gz").unwrap(); /// @@ -307,7 +308,7 @@ pub fn infer_json_schema_from_seekable( pub fn infer_json_schema( reader: &mut BufReader, max_read_records: Option, -) -> Result { +) -> Result { 
infer_json_schema_from_iterator(ValueIter::new(reader, max_read_records)) } @@ -315,7 +316,7 @@ fn set_object_scalar_field_type( field_types: &mut HashMap, key: &str, ftype: DataType, -) -> Result<()> { +) -> Result<(), ArrowError> { if !field_types.contains_key(key) { field_types.insert(key.to_string(), InferredType::Scalar(HashSet::new())); } @@ -340,7 +341,7 @@ fn set_object_scalar_field_type( } } -fn infer_scalar_array_type(array: &[Value]) -> Result { +fn infer_scalar_array_type(array: &[Value]) -> Result { let mut hs = HashSet::new(); for v in array { @@ -371,7 +372,7 @@ fn infer_scalar_array_type(array: &[Value]) -> Result { Ok(InferredType::Scalar(hs)) } -fn infer_nested_array_type(array: &[Value]) -> Result { +fn infer_nested_array_type(array: &[Value]) -> Result { let mut inner_ele_type = InferredType::Any; for v in array { @@ -391,7 +392,7 @@ fn infer_nested_array_type(array: &[Value]) -> Result { Ok(InferredType::Array(Box::new(inner_ele_type))) } -fn infer_struct_array_type(array: &[Value]) -> Result { +fn infer_struct_array_type(array: &[Value]) -> Result { let mut field_types = HashMap::new(); for v in array { @@ -411,7 +412,7 @@ fn infer_struct_array_type(array: &[Value]) -> Result { Ok(InferredType::Object(field_types)) } -fn infer_array_element_type(array: &[Value]) -> Result { +fn infer_array_element_type(array: &[Value]) -> Result { match array.iter().take(1).next() { None => Ok(InferredType::Any), // empty array, return any type that can be updated later Some(a) => match a { @@ -425,7 +426,7 @@ fn infer_array_element_type(array: &[Value]) -> Result { fn collect_field_types_from_object( field_types: &mut HashMap, map: &JsonMap, -) -> Result<()> { +) -> Result<(), ArrowError> { for (k, v) in map { match v { Value::Array(array) => { @@ -532,9 +533,9 @@ fn collect_field_types_from_object( /// The reason we diverge here is because we don't have utilities to deal with JSON data once it's /// interpreted as Strings. We should match Spark's behavior once we added more JSON parsing /// kernels in the future. -pub fn infer_json_schema_from_iterator(value_iter: I) -> Result +pub fn infer_json_schema_from_iterator(value_iter: I) -> Result where - I: Iterator>, + I: Iterator>, { let mut field_types: HashMap = HashMap::new(); @@ -563,7 +564,7 @@ where /// /// # Examples /// ``` -/// use arrow::json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; +/// use arrow_json::reader::{Decoder, DecoderOptions, ValueIter, infer_json_schema}; /// use std::fs::File; /// use std::io::{BufReader, Seek, SeekFrom}; /// use std::sync::Arc; @@ -673,9 +674,12 @@ impl Decoder { /// interator into a [`RecordBatch`]. /// /// Returns `None` if the input iterator is exhausted. 
- pub fn next_batch(&self, value_iter: &mut I) -> Result> + pub fn next_batch( + &self, + value_iter: &mut I, + ) -> Result, ArrowError> where - I: Iterator>, + I: Iterator>, { let batch_size = self.options.batch_size; let mut rows: Vec = Vec::with_capacity(batch_size); @@ -732,7 +736,7 @@ impl Decoder { rows: &[Value], col_name: &str, key_type: &DataType, - ) -> Result { + ) -> Result { match *key_type { DataType::Int8 => { let dtype = DataType::Dictionary( @@ -803,7 +807,7 @@ impl Decoder { data_type: &DataType, col_name: &str, rows: &[Value], - ) -> Result + ) -> Result where DT: ArrowPrimitiveType + ArrowDictionaryKeyType, { @@ -923,7 +927,7 @@ impl Decoder { col_name: &str, key_type: &DataType, value_type: &DataType, - ) -> Result { + ) -> Result { if let DataType::Utf8 = *value_type { match *key_type { DataType::Int8 => self.build_dictionary_array::(rows, col_name), @@ -959,7 +963,11 @@ impl Decoder { } } - fn build_boolean_array(&self, rows: &[Value], col_name: &str) -> Result { + fn build_boolean_array( + &self, + rows: &[Value], + col_name: &str, + ) -> Result { let mut builder = BooleanBuilder::with_capacity(rows.len()); for row in rows { if let Some(value) = row.get(col_name) { @@ -980,9 +988,9 @@ impl Decoder { &self, rows: &[Value], col_name: &str, - ) -> Result + ) -> Result where - T: ArrowNumericType, + T: ArrowPrimitiveType, T::Native: num::NumCast, { let format_string = self @@ -1019,7 +1027,7 @@ impl Decoder { &self, rows: &[Value], list_field: &Field, - ) -> Result { + ) -> Result { // build list offsets let mut cur_offset = OffsetSize::zero(); let list_len = rows.len(); @@ -1188,8 +1196,8 @@ impl Decoder { rows: &[Value], struct_fields: &[Field], projection: &Option>, - ) -> Result> { - let arrays: Result> = struct_fields + ) -> Result, ArrowError> { + let arrays: Result, ArrowError> = struct_fields .iter() .filter(|field| { projection @@ -1393,7 +1401,7 @@ impl Decoder { field_name: &str, map_type: &DataType, struct_field: &Field, - ) -> Result { + ) -> Result { // A map has the format {"key": "value"} where key is most commonly a string, // but could be a string, number or boolean (🤷🏾‍♂️) (e.g. {1: "value"}). 
// A map is also represented as a flattened contiguous array, with the number @@ -1488,7 +1496,7 @@ impl Decoder { &self, rows: &[Value], col_name: &str, - ) -> Result + ) -> Result where T::Native: num::NumCast, T: ArrowPrimitiveType + ArrowDictionaryKeyType, @@ -1512,7 +1520,7 @@ impl Decoder { /// Read the primitive list's values into ArrayData fn read_primitive_list_values(&self, rows: &[Value]) -> ArrayData where - T: ArrowPrimitiveType + ArrowNumericType, + T: ArrowPrimitiveType, T::Native: num::NumCast, { let values = rows @@ -1637,7 +1645,7 @@ impl Reader { /// Read the next batch of records #[allow(clippy::should_implement_trait)] - pub fn next(&mut self) -> Result> { + pub fn next(&mut self) -> Result, ArrowError> { self.decoder .next_batch(&mut ValueIter::new(&mut self.reader, None)) } @@ -1667,16 +1675,13 @@ impl ReaderBuilder { /// # Example /// /// ``` - /// extern crate arrow; - /// - /// use arrow::json; - /// use std::fs::File; + /// # use std::fs::File; /// - /// fn example() -> json::Reader { + /// fn example() -> arrow_json::Reader { /// let file = File::open("test/data/basic.json").unwrap(); /// /// // create a builder, inferring the schema with the first 100 records - /// let builder = json::ReaderBuilder::new().infer_schema(Some(100)); + /// let builder = arrow_json::ReaderBuilder::new().infer_schema(Some(100)); /// /// let reader = builder.build::(file).unwrap(); /// @@ -1723,7 +1728,7 @@ impl ReaderBuilder { } /// Create a new `Reader` from the `ReaderBuilder` - pub fn build(self, source: R) -> Result> + pub fn build(self, source: R) -> Result, ArrowError> where R: Read + Seek, { @@ -1743,7 +1748,7 @@ impl ReaderBuilder { } impl Iterator for Reader { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { self.next().transpose() @@ -1752,12 +1757,9 @@ impl Iterator for Reader { #[cfg(test)] mod tests { - use crate::{ - buffer::Buffer, - datatypes::DataType::{Dictionary, List}, - }; - use super::*; + use arrow_buffer::ToByteSlice; + use arrow_schema::DataType::{Dictionary, List}; use flate2::read::GzDecoder; use std::fs::File; use std::io::Cursor; @@ -2076,12 +2078,8 @@ mod tests { #[test] fn test_invalid_json_infer_schema() { - let re = infer_json_schema_from_seekable( - &mut BufReader::new( - File::open("test/data/uk_cities_with_headers.csv").unwrap(), - ), - None, - ); + let re = + infer_json_schema_from_seekable(&mut BufReader::new(Cursor::new(b"}")), None); assert_eq!( re.err().unwrap().to_string(), "Json error: Not valid JSON: expected value at line 1 column 1", @@ -2096,9 +2094,7 @@ mod tests { true, )])); let builder = ReaderBuilder::new().with_schema(schema).with_batch_size(64); - let mut reader: Reader = builder - .build::(File::open("test/data/uk_cities_with_headers.csv").unwrap()) - .unwrap(); + let mut reader = builder.build(Cursor::new(b"}")).unwrap(); assert_eq!( reader.next().err().unwrap().to_string(), "Json error: Not valid JSON: expected value at line 1 column 1", @@ -2107,7 +2103,7 @@ mod tests { #[test] fn test_coersion_scalar_and_list() { - use crate::datatypes::DataType::*; + use arrow_schema::DataType::*; assert_eq!( List(Box::new(Field::new("item", Float64, true))), diff --git a/arrow/src/json/writer.rs b/arrow-json/src/writer.rs similarity index 95% rename from arrow/src/json/writer.rs rename to arrow-json/src/writer.rs index f622b0cce77..69f62660039 100644 --- a/arrow/src/json/writer.rs +++ b/arrow-json/src/writer.rs @@ -27,18 +27,15 @@ //! [`record_batches_to_json_rows`]: //! //! ``` -//! use std::sync::Arc; -//! -//! 
use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! # use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); //! let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); //! -//! let json_rows = json::writer::record_batches_to_json_rows(&[batch]).unwrap(); +//! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[batch]).unwrap(); //! assert_eq!( //! serde_json::Value::Object(json_rows[1].clone()), //! serde_json::json!({"a": 2}), @@ -51,12 +48,9 @@ //! [`LineDelimitedWriter`]: //! //! ``` -//! use std::sync::Arc; -//! -//! use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! # use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); @@ -64,7 +58,7 @@ //! //! // Write the record batch out as JSON //! let buf = Vec::new(); -//! let mut writer = json::LineDelimitedWriter::new(buf); +//! let mut writer = arrow_json::LineDelimitedWriter::new(buf); //! writer.write_batches(&vec![batch]).unwrap(); //! writer.finish().unwrap(); //! @@ -80,12 +74,9 @@ //! [`ArrayWriter`]: //! //! ``` -//! use std::sync::Arc; -//! -//! use arrow::array::Int32Array; -//! use arrow::datatypes::{DataType, Field, Schema}; -//! use arrow::json; -//! use arrow::record_batch::RecordBatch; +//! # use std::sync::Arc; +//! # use arrow_array::{Int32Array, RecordBatch}; +//! use arrow_schema::{DataType, Field, Schema}; //! //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); //! let a = Int32Array::from(vec![1, 2, 3]); @@ -93,7 +84,7 @@ //! //! // Write the record batch out as a JSON array //! let buf = Vec::new(); -//! let mut writer = json::ArrayWriter::new(buf); +//! let mut writer = arrow_json::ArrayWriter::new(buf); //! writer.write_batches(&vec![batch]).unwrap(); //! writer.finish().unwrap(); //! 
@@ -108,13 +99,13 @@ use std::{fmt::Debug, io::Write}; use serde_json::map::Map as JsonMap; use serde_json::Value; -use crate::array::*; -use crate::datatypes::*; -use crate::error::{ArrowError, Result}; -use crate::json::JsonSerializable; -use crate::record_batch::RecordBatch; +use crate::JsonSerializable; +use arrow_array::cast::*; +use arrow_array::types::*; +use arrow_array::*; +use arrow_schema::*; -fn primitive_array_to_json(array: &ArrayRef) -> Result> +fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where T: ArrowPrimitiveType, T::Native: JsonSerializable, @@ -131,7 +122,7 @@ where fn struct_array_to_jsonmap_array( array: &StructArray, row_count: usize, -) -> Result>> { +) -> Result>, ArrowError> { let inner_col_names = array.column_names(); let mut inner_objs = iter::repeat(JsonMap::new()) @@ -150,7 +141,7 @@ fn struct_array_to_jsonmap_array( } /// Converts an arrow [`ArrayRef`] into a `Vec` of Serde JSON [`serde_json::Value`]'s -pub fn array_to_json_array(array: &ArrayRef) -> Result> { +pub fn array_to_json_array(array: &ArrayRef) -> Result, ArrowError> { match array.data_type() { DataType::Null => Ok(iter::repeat(Value::Null).take(array.len()).collect()), DataType::Boolean => Ok(as_boolean_array(array) @@ -269,7 +260,7 @@ fn set_column_for_json_rows( row_count: usize, array: &ArrayRef, col_name: &str, -) -> Result<()> { +) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { set_column_by_primitive_type::(rows, row_count, array, col_name); @@ -474,7 +465,7 @@ fn set_column_for_json_rows( rows.iter_mut() .zip(listarr.iter()) .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<()> { + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { row.insert( col_name.to_string(), @@ -489,7 +480,7 @@ fn set_column_for_json_rows( rows.iter_mut() .zip(listarr.iter()) .take(row_count) - .try_for_each(|(row, maybe_value)| -> Result<()> { + .try_for_each(|(row, maybe_value)| -> Result<(), ArrowError> { if let Some(v) = maybe_value { let val = array_to_json_array(&v)?; row.insert(col_name.to_string(), Value::Array(val)); @@ -499,7 +490,7 @@ fn set_column_for_json_rows( } DataType::Dictionary(_, value_type) => { let slice = array.slice(0, row_count); - let hydrated = crate::compute::kernels::cast::cast(&slice, value_type) + let hydrated = arrow_cast::cast::cast(&slice, value_type) .expect("cannot cast dictionary to underlying values"); set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; } @@ -555,7 +546,7 @@ fn set_column_for_json_rows( /// [`JsonMap`]s (objects) pub fn record_batches_to_json_rows( batches: &[RecordBatch], -) -> Result>> { +) -> Result>, ArrowError> { let mut rows: Vec> = iter::repeat(JsonMap::new()) .take(batches.iter().map(|b| b.num_rows()).sum()) .collect(); @@ -581,24 +572,28 @@ pub fn record_batches_to_json_rows( pub trait JsonFormat: Debug + Default { #[inline] /// write any bytes needed at the start of the file to the writer - fn start_stream(&self, _writer: &mut W) -> Result<()> { + fn start_stream(&self, _writer: &mut W) -> Result<(), ArrowError> { Ok(()) } #[inline] /// write any bytes needed for the start of each row - fn start_row(&self, _writer: &mut W, _is_first_row: bool) -> Result<()> { + fn start_row( + &self, + _writer: &mut W, + _is_first_row: bool, + ) -> Result<(), ArrowError> { Ok(()) } #[inline] /// write any bytes needed for the end of each row - fn end_row(&self, _writer: &mut W) -> Result<()> { + fn end_row(&self, _writer: &mut W) -> Result<(), 
ArrowError> { Ok(()) } /// write any bytes needed for the start of each row - fn end_stream(&self, _writer: &mut W) -> Result<()> { + fn end_stream(&self, _writer: &mut W) -> Result<(), ArrowError> { Ok(()) } } @@ -614,7 +609,7 @@ pub trait JsonFormat: Debug + Default { pub struct LineDelimited {} impl JsonFormat for LineDelimited { - fn end_row(&self, writer: &mut W) -> Result<()> { + fn end_row(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"\n")?; Ok(()) } @@ -629,19 +624,23 @@ impl JsonFormat for LineDelimited { pub struct JsonArray {} impl JsonFormat for JsonArray { - fn start_stream(&self, writer: &mut W) -> Result<()> { + fn start_stream(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"[")?; Ok(()) } - fn start_row(&self, writer: &mut W, is_first_row: bool) -> Result<()> { + fn start_row( + &self, + writer: &mut W, + is_first_row: bool, + ) -> Result<(), ArrowError> { if !is_first_row { writer.write_all(b",")?; } Ok(()) } - fn end_stream(&self, writer: &mut W) -> Result<()> { + fn end_stream(&self, writer: &mut W) -> Result<(), ArrowError> { writer.write_all(b"]")?; Ok(()) } @@ -692,7 +691,7 @@ where } /// Write a single JSON row to the output writer - pub fn write_row(&mut self, row: &Value) -> Result<()> { + pub fn write_row(&mut self, row: &Value) -> Result<(), ArrowError> { let is_first_row = !self.started; if !self.started { self.format.start_stream(&mut self.writer)?; @@ -709,7 +708,7 @@ where } /// Convert the `RecordBatch` into JSON rows, and write them to the output - pub fn write(&mut self, batch: RecordBatch) -> Result<()> { + pub fn write(&mut self, batch: RecordBatch) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(&[batch])? { self.write_row(&Value::Object(row))?; } @@ -717,7 +716,7 @@ where } /// Convert the [`RecordBatch`] into JSON rows, and write them to the output - pub fn write_batches(&mut self, batches: &[RecordBatch]) -> Result<()> { + pub fn write_batches(&mut self, batches: &[RecordBatch]) -> Result<(), ArrowError> { for row in record_batches_to_json_rows(batches)? { self.write_row(&Value::Object(row))?; } @@ -727,7 +726,7 @@ where /// Finishes the output stream. This function must be called after /// all record batches have been produced. (e.g. producing the final `']'` if writing /// arrays. 
- pub fn finish(&mut self) -> Result<()> { + pub fn finish(&mut self) -> Result<(), ArrowError> { if self.started && !self.finished { self.format.end_stream(&mut self.writer)?; self.finished = true; @@ -743,15 +742,14 @@ where #[cfg(test)] mod tests { - use std::convert::TryFrom; use std::fs::{read_to_string, File}; use std::sync::Arc; + use crate::reader::*; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_data::ArrayData; use serde_json::json; - use crate::buffer::*; - use crate::json::reader::*; - use super::*; /// Asserts that the NDJSON `input` is semantically identical to `expected` diff --git a/arrow/test/data/arrays.json b/arrow-json/test/data/arrays.json similarity index 100% rename from arrow/test/data/arrays.json rename to arrow-json/test/data/arrays.json diff --git a/arrow/test/data/basic.json b/arrow-json/test/data/basic.json similarity index 100% rename from arrow/test/data/basic.json rename to arrow-json/test/data/basic.json diff --git a/arrow/test/data/basic_nulls.json b/arrow-json/test/data/basic_nulls.json similarity index 100% rename from arrow/test/data/basic_nulls.json rename to arrow-json/test/data/basic_nulls.json diff --git a/arrow/test/data/list_string_dict_nested.json b/arrow-json/test/data/list_string_dict_nested.json similarity index 100% rename from arrow/test/data/list_string_dict_nested.json rename to arrow-json/test/data/list_string_dict_nested.json diff --git a/arrow/test/data/list_string_dict_nested_nulls.json b/arrow-json/test/data/list_string_dict_nested_nulls.json similarity index 100% rename from arrow/test/data/list_string_dict_nested_nulls.json rename to arrow-json/test/data/list_string_dict_nested_nulls.json diff --git a/arrow/test/data/mixed_arrays.json b/arrow-json/test/data/mixed_arrays.json similarity index 100% rename from arrow/test/data/mixed_arrays.json rename to arrow-json/test/data/mixed_arrays.json diff --git a/arrow/test/data/mixed_arrays.json.gz b/arrow-json/test/data/mixed_arrays.json.gz similarity index 100% rename from arrow/test/data/mixed_arrays.json.gz rename to arrow-json/test/data/mixed_arrays.json.gz diff --git a/arrow/test/data/nested_structs.json b/arrow-json/test/data/nested_structs.json similarity index 100% rename from arrow/test/data/nested_structs.json rename to arrow-json/test/data/nested_structs.json diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index cc9421de710..d5392673e29 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -44,16 +44,15 @@ ahash = { version = "0.8", default-features = false, features = ["compile-time-r ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } [dependencies] +arrow-array = { version = "26.0.0", path = "../arrow-array" } arrow-buffer = { version = "26.0.0", path = "../arrow-buffer" } arrow-cast = { version = "26.0.0", path = "../arrow-cast" } arrow-csv = { version = "26.0.0", path = "../arrow-csv", optional = true } arrow-data = { version = "26.0.0", path = "../arrow-data" } +arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } +arrow-json = { version = "26.0.0", path = "../arrow-json", optional = true } arrow-schema = { version = "26.0.0", path = "../arrow-schema" } -arrow-array = { version = "26.0.0", path = "../arrow-array" } arrow-select = { version = "26.0.0", path = "../arrow-select" } -arrow-ipc = { version = "26.0.0", path = "../arrow-ipc", optional = true } -serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } -indexmap = { version = "1.9", default-features = false, features 
= ["std"] } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.1", default-features = false, features = ["num-traits"] } @@ -64,9 +63,8 @@ packed_simd = { version = "0.3", default-features = false, optional = true, pack chrono = { version = "0.4", default-features = false, features = ["clock"] } comfy-table = { version = "6.0", optional = true, default-features = false } pyo3 = { version = "0.17", default-features = false, optional = true } -lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } multiversion = { version = "0.6.1", default-features = false } -bitflags = { version = "1.2.1", default-features = false } +bitflags = { version = "1.2.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "dyn_cmp_dict", "ffi", "pyarrow"] @@ -76,7 +74,7 @@ default = ["csv", "ipc", "json"] ipc_compression = ["ipc", "arrow-ipc/lz4", "arrow-ipc/zstd"] csv = ["arrow-csv"] ipc = ["arrow-ipc"] -json = ["serde_json"] +json = ["arrow-json"] simd = ["packed_simd"] prettyprint = ["comfy-table"] # The test utils feature enables code used in benchmarks and tests but @@ -90,7 +88,7 @@ pyarrow = ["pyo3", "ffi"] # but is run as part of our CI checks force_validate = ["arrow-data/force_validate"] # Enable ffi support -ffi = [] +ffi = ["bitflags"] # Enable dyn-comparison of dictionary arrays with other arrays # Note: this does not impact comparison against scalars dyn_cmp_dict = [] @@ -100,9 +98,9 @@ dyn_arith_dict = [] chrono-tz = ["arrow-array/chrono-tz"] [dev-dependencies] -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +chrono = { version = "0.4", default-features = false, features = ["clock"] } criterion = { version = "0.4", default-features = false } -flate2 = { version = "1", default-features = false, features = ["rust_backend"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } tempfile = { version = "3", default-features = false } [build-dependencies] diff --git a/arrow/benches/json_reader.rs b/arrow/benches/json_reader.rs index ef3ddf0537b..7bc3f4179fe 100644 --- a/arrow/benches/json_reader.rs +++ b/arrow/benches/json_reader.rs @@ -15,13 +15,10 @@ // specific language governing permissions and limitations // under the License. -extern crate arrow; -extern crate criterion; - use criterion::*; use arrow::datatypes::*; -use arrow::json::ReaderBuilder; +use arrow_json::ReaderBuilder; use std::io::Cursor; use std::sync::Arc; diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index d1e0095840a..1b2ff0684a6 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -34,7 +34,9 @@ //! * [`arrow-array`][arrow_array] - type-safe arrow array abstractions //! * [`arrow-buffer`][arrow_buffer] - buffer abstractions for arrow arrays //! * [`arrow-cast`][arrow_cast] - cast kernels for arrow arrays +//! * [`arrow-csv`][arrow_csv] - read/write CSV to arrow format //! * [`arrow-data`][arrow_data] - the underlying data of arrow arrays +//! * [`arrow-json`][arrow_json] - read/write JSON to arrow format //! * [`arrow-schema`][arrow_schema] - the logical types for arrow arrays //! * [`arrow-select`][arrow_select] - selection kernels for arrow arrays //! 
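The `arrow` crate's `json` feature now simply pulls in the new `arrow-json` crate, and the benchmark change above already imports `arrow_json::ReaderBuilder` directly. A minimal sketch of that direct usage, assuming the arrow-json 26.0.0 crate introduced by this patch; the field name and NDJSON payload are invented for illustration:

```rust
// Sketch only: read two made-up NDJSON records with an explicit schema.
use std::io::Cursor;
use std::sync::Arc;

use arrow_json::ReaderBuilder;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // Illustrative payload and field name, not taken from the patch.
    let data = "{\"a\": 1}\n{\"a\": 2}\n";
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));

    // Same builder API as before the split; only the crate path changed.
    let mut reader = ReaderBuilder::new()
        .with_schema(schema)
        .with_batch_size(64)
        .build(Cursor::new(data))
        .unwrap();

    let batch = reader.next().unwrap().unwrap();
    assert_eq!(batch.num_rows(), 2);
    assert_eq!(batch.num_columns(), 1);
}
```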
@@ -315,8 +317,8 @@ pub mod ffi;
 pub mod ffi_stream;
 #[cfg(feature = "ipc")]
 pub use arrow_ipc as ipc;
-#[cfg(feature = "serde_json")]
-pub mod json;
+#[cfg(feature = "json")]
+pub use arrow_json as json;
 #[cfg(feature = "pyarrow")]
 pub mod pyarrow;

diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index bafee11edb7..0ca2ab91a5e 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -3,6 +3,7 @@ testing/*
 target/*
 dev/release/rat_exclude_files.txt
 arrow/test/data/*
+arrow-json/test/data/*
 arrow/test/dependency/*
 arrow-integration-test/data/*
 parquet_derive/test/dependency/*
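With the re-export in place (`pub use arrow_json as json;`), code that keeps going through `arrow::json` compiles as before once the `json` feature is enabled. For the write path, here is a minimal sketch patterned after the doc example in `arrow-json/src/writer.rs` above; the column name and values are illustrative:

```rust
// Sketch only: write one RecordBatch as newline-delimited JSON.
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};

fn main() {
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    let a = Int32Array::from(vec![1, 2, 3]);
    let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();

    // Write into a borrowed Vec<u8> so the buffer stays accessible afterwards.
    let mut buf = Vec::new();
    {
        let mut writer = arrow_json::LineDelimitedWriter::new(&mut buf);
        writer.write_batches(&[batch]).unwrap();
        writer.finish().unwrap();
    }

    assert_eq!(String::from_utf8(buf).unwrap(), "{\"a\":1}\n{\"a\":2}\n{\"a\":3}\n");
}
```

Swapping `LineDelimitedWriter` for `ArrayWriter` would produce a single JSON array instead of newline-delimited rows, as the module docs in the patch describe.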